diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 00c6f90e1c864..23d5490a58b7e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -6036,6 +6036,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) { HandOpcode == ISD::ANY_EXTEND_VECTOR_INREG) && LegalTypes && !TLI.isTypeDesirableForOp(LogicOpcode, XVT)) return SDValue(); + // Prevent an infinite loop if the target prefers the inverse + // transformation. + if (TLI.isNarrowingProfitable(N, XVT, VT)) + return SDValue(); // logic_op (hand_op X), (hand_op Y) --> hand_op (logic_op X, Y) SDNodeFlags LogicFlags; LogicFlags.setDisjoint(N->getFlags().hasDisjoint() && @@ -6048,6 +6052,9 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) { // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y) if (HandOpcode == ISD::TRUNCATE) { + // Don't create a logic op on an illegal type. + if (!TLI.isTypeLegal(XVT)) + return SDValue(); // If both operands have other uses, this transform would create extra // instructions without eliminating anything. if (!N0.hasOneUse() && !N1.hasOneUse()) @@ -6059,10 +6066,12 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) { if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT)) return SDValue(); // Be extra careful sinking truncate. If it's free, there's no benefit in - // widening a binop. Also, don't create a logic op on an illegal type. + // widening a binop. if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT)) return SDValue(); - if (!TLI.isTypeLegal(XVT)) + // Prevent an infinite loop if the target prefers the inverse + // transformation. + if (TLI.isNarrowingProfitable(N, XVT, VT)) return SDValue(); SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y); return DAG.getNode(HandOpcode, DL, VT, Logic); @@ -15869,6 +15878,28 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) { break; } + if (!LegalOperations || TLI.isOperationLegal(N0.getOpcode(), VT)) { + switch (N0.getOpcode()) { + case ISD::ADD: + case ISD::SUB: + case ISD::MUL: + case ISD::AND: + case ISD::OR: + case ISD::XOR: + if (!N0.hasOneUse() || !VT.isScalarInteger()) + break; + if (!TLI.isNarrowingProfitable(N0.getNode(), SrcVT, VT)) + break; + SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0)); + SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1)); + SDValue TruncatedOp = + DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR); + if (TLI.IsDesirableToPromoteOp(TruncatedOp, SrcVT)) + break; + return TruncatedOp; + } + } + return SDValue(); } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index b6023b4f3fbcf..f120004bcc879 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -1040,6 +1040,8 @@ bool AMDGPUTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, case ISD::MUL: case ISD::SETCC: case ISD::SELECT: + if (DestVT.getScalarSizeInBits() == 1) + return false; if (Subtarget->has16BitInsts() && (!DestVT.isVector() || !Subtarget->hasVOP3PInsts())) { // Don't narrow back down to i16 if promoted to i32 already. 
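For illustration, here is a minimal, hypothetical LLVM IR sketch (the function name and values are invented, not taken from this patch's tests) of the pattern the new visitTRUNCATE combine narrows when the target's isNarrowingProfitable hook reports SrcVT -> DestVT as profitable (for example i64 -> i32 on NVPTX, per the hook added below) and the binary op has a single use:

; Before the combine: the add is performed in i64 and only then truncated.
define i32 @narrow_add_sketch(i64 %a, i64 %b) {
  %wide = add i64 %a, %b
  %r = trunc i64 %wide to i32
  ret i32 %r
}
; After the combine (conceptually), the arithmetic happens in the narrow type:
;   %na = trunc i64 %a to i32
;   %nb = trunc i64 %b to i32
;   %r  = add i32 %na, %nb

The same rewrite applies to sub, mul, and, or, and xor, and is skipped when IsDesirableToPromoteOp asks to keep the wide type.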
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 7a8bf3bf33a94..6df00d582a763 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -162,6 +162,14 @@ class NVPTXTargetLowering : public TargetLowering { DstTy->getPrimitiveSizeInBits() == 32; } + bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override { + // Truncating 64-bit to 32-bit is free in SASS. + if (!SrcVT.isScalarInteger() || !DestVT.isScalarInteger()) + return false; + return SrcVT.getFixedSizeInBits() == 64 && + DestVT.getFixedSizeInBits() == 32; + } + EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx, EVT VT) const override { if (VT.isVector()) diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 11d77599d4ac3..1d6d6ddb40b73 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -1438,8 +1438,8 @@ def : Pat<(mul i1:$a, imm:$b), (ANDb1ri $a, imm:$b)>; // These transformations were once reliably performed by instcombine, but thanks // to poison semantics they are no longer safe for LLVM IR, perform them here // instead. -def : Pat<(select i1:$a, i1:$b, 0), (ANDb1rr $a, $b)>; -def : Pat<(select i1:$a, 1, i1:$b), (ORb1rr $a, $b)>; +def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>; +def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>; // Lower logical v2i16/v4i8 ops as bitwise ops on b32. foreach vt = [v2i16, v4i8] in { diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 3d9c76f3d05f5..750aa2903c75e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -35425,6 +35425,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF, bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const { // i16 instructions are longer (0x66 prefix) and potentially slower. 
+ if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger())) + return false; return !(SrcVT == MVT::i32 && DestVT == MVT::i16); } diff --git a/llvm/test/CodeGen/AMDGPU/add_i1.ll b/llvm/test/CodeGen/AMDGPU/add_i1.ll index ff1a3ee38be1d..961a37a4aad8c 100644 --- a/llvm/test/CodeGen/AMDGPU/add_i1.ll +++ b/llvm/test/CodeGen/AMDGPU/add_i1.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX9 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,GFX10 %s @@ -6,6 +7,20 @@ ; GFX9: v_xor_b32_e32 ; GFX10: v_xor_b32_e32 define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; GFX9-LABEL: add_var_var_i1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_ubyte v2, v0, s[6:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in0 %b = load volatile i1, ptr addrspace(1) %in1 %add = add i1 %a, %b @@ -17,6 +32,17 @@ define amdgpu_kernel void @add_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9: s_xor_b64 ; GFX10: s_xor_b32 define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; GFX9-LABEL: add_var_imm_i1: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v1, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_not_b32_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in %add = add i1 %a, 1 store i1 %add, ptr addrspace(1) %out @@ -28,6 +54,44 @@ define amdgpu_kernel void @add_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1 ; GFX9: s_xor_b64 ; GFX10: s_xor_b32 define amdgpu_kernel void @add_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; GFX9-LABEL: add_i1_cf: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; GFX9-NEXT: ; implicit-def: $sgpr4_sgpr5 +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], vcc +; GFX9-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; GFX9-NEXT: s_cbranch_execz .LBB2_2 +; GFX9-NEXT: ; %bb.1: ; %else +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v0, s[8:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; GFX9-NEXT: .LBB2_2: ; %Flow +; GFX9-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; GFX9-NEXT: s_cbranch_execz .LBB2_4 +; GFX9-NEXT: ; %bb.3: ; %if +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_ubyte v0, v0, s[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: s_andn2_b64 s[2:3], s[4:5], exec +; 
GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_and_b64 s[4:5], vcc, exec +; GFX9-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; GFX9-NEXT: .LBB2_4: ; %endif +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX9-NEXT: v_not_b32_e32 v1, v1 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] +; GFX9-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %d_cmp = icmp ult i32 %tid, 16 @@ -49,3 +113,6 @@ endif: } declare i32 @llvm.amdgcn.workitem.id.x() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; GFX10: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll index 3160e38df5e3f..85cb6fa297166 100644 --- a/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/agpr-copy-no-free-registers.ll @@ -565,8 +565,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_cbranch_vccz .LBB3_12 ; GFX908-NEXT: .LBB3_2: ; %bb9 ; GFX908-NEXT: ; =>This Loop Header: Depth=1 -; GFX908-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX908-NEXT: s_mov_b64 s[18:19], -1 +; GFX908-NEXT: ; Child Loop BB3_6 Depth 2 +; GFX908-NEXT: s_mov_b64 s[22:23], -1 ; GFX908-NEXT: s_mov_b64 vcc, s[0:1] ; GFX908-NEXT: s_cbranch_vccz .LBB3_10 ; GFX908-NEXT: ; %bb.3: ; %bb14 @@ -597,18 +597,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: s_add_i32 s13, s22, s13 ; GFX908-NEXT: s_mul_i32 s9, s6, s9 ; GFX908-NEXT: s_add_i32 s13, s13, s23 -; GFX908-NEXT: s_branch .LBB3_5 +; GFX908-NEXT: s_branch .LBB3_6 ; GFX908-NEXT: .LBB3_4: ; %bb58 -; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2 ; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX908-NEXT: s_add_u32 s20, s20, s4 ; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3] ; GFX908-NEXT: s_addc_u32 s21, s21, s5 ; GFX908-NEXT: s_mov_b64 s[22:23], 0 +; GFX908-NEXT: .LBB3_5: ; %Flow18 +; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2 +; GFX908-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[22:23] +; GFX908-NEXT: v_readfirstlane_b32 s22, v12 +; GFX908-NEXT: s_not_b32 s22, s22 +; GFX908-NEXT: s_bitcmp1_b32 s22, 0 +; GFX908-NEXT: s_cselect_b64 s[22:23], -1, 0 ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25] -; GFX908-NEXT: s_cbranch_vccz .LBB3_9 -; GFX908-NEXT: .LBB3_5: ; %bb16 +; GFX908-NEXT: s_cbranch_vccz .LBB3_10 +; GFX908-NEXT: .LBB3_6: ; %bb16 ; GFX908-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX908-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX908-NEXT: s_add_u32 s22, s20, s9 @@ -625,9 +632,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: ds_read_b64 v[14:15], v0 ; GFX908-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: s_cbranch_vccnz .LBB3_7 -; GFX908-NEXT: ; %bb.6: ; %bb51 -; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: s_cbranch_vccnz .LBB3_8 +; GFX908-NEXT: ; %bb.7: ; %bb51 +; GFX908-NEXT: ; in Loop: Header=BB3_6 Depth=2 ; GFX908-NEXT: v_cvt_f32_f16_sdwa v22, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX908-NEXT: v_cvt_f32_f16_e32 v21, v21 ; GFX908-NEXT: 
v_cvt_f32_f16_sdwa v23, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -649,21 +656,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX908-NEXT: v_add_f32_e32 v10, v10, v12 ; GFX908-NEXT: v_add_f32_e32 v11, v11, v13 ; GFX908-NEXT: s_branch .LBB3_4 -; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 +; GFX908-NEXT: .LBB3_8: ; in Loop: Header=BB3_6 Depth=2 ; GFX908-NEXT: s_mov_b64 s[22:23], s[18:19] ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_4 -; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 +; GFX908-NEXT: ; %bb.9: ; in Loop: Header=BB3_6 Depth=2 ; GFX908-NEXT: s_mov_b64 s[22:23], -1 ; GFX908-NEXT: ; implicit-def: $vgpr2_vgpr3 ; GFX908-NEXT: ; implicit-def: $sgpr20_sgpr21 -; GFX908-NEXT: .LBB3_9: ; %loop.exit.guard -; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX908-NEXT: s_xor_b64 s[18:19], s[22:23], -1 +; GFX908-NEXT: s_mov_b64 s[24:25], -1 +; GFX908-NEXT: s_branch .LBB3_5 ; GFX908-NEXT: .LBB3_10: ; %Flow19 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX908-NEXT: s_mov_b64 s[2:3], -1 -; GFX908-NEXT: s_and_b64 vcc, exec, s[18:19] +; GFX908-NEXT: s_and_b64 vcc, exec, s[22:23] ; GFX908-NEXT: s_cbranch_vccz .LBB3_1 ; GFX908-NEXT: ; %bb.11: ; %bb12 ; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1 @@ -730,8 +736,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_cbranch_vccz .LBB3_12 ; GFX90A-NEXT: .LBB3_2: ; %bb9 ; GFX90A-NEXT: ; =>This Loop Header: Depth=1 -; GFX90A-NEXT: ; Child Loop BB3_5 Depth 2 -; GFX90A-NEXT: s_mov_b64 s[18:19], -1 +; GFX90A-NEXT: ; Child Loop BB3_6 Depth 2 +; GFX90A-NEXT: s_mov_b64 s[22:23], -1 ; GFX90A-NEXT: s_mov_b64 vcc, s[0:1] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_10 ; GFX90A-NEXT: ; %bb.3: ; %bb14 @@ -758,18 +764,25 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_add_i32 s13, s22, s13 ; GFX90A-NEXT: s_mul_i32 s9, s6, s9 ; GFX90A-NEXT: s_add_i32 s13, s13, s23 -; GFX90A-NEXT: s_branch .LBB3_5 +; GFX90A-NEXT: s_branch .LBB3_6 ; GFX90A-NEXT: .LBB3_4: ; %bb58 -; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2 ; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc ; GFX90A-NEXT: s_add_u32 s20, s20, s4 ; GFX90A-NEXT: s_addc_u32 s21, s21, s5 ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5] ; GFX90A-NEXT: s_mov_b64 s[22:23], 0 +; GFX90A-NEXT: .LBB3_5: ; %Flow18 +; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2 +; GFX90A-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[22:23] +; GFX90A-NEXT: v_readfirstlane_b32 s22, v14 +; GFX90A-NEXT: s_not_b32 s22, s22 +; GFX90A-NEXT: s_bitcmp1_b32 s22, 0 +; GFX90A-NEXT: s_cselect_b64 s[22:23], -1, 0 ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25] -; GFX90A-NEXT: s_cbranch_vccz .LBB3_9 -; GFX90A-NEXT: .LBB3_5: ; %bb16 +; GFX90A-NEXT: s_cbranch_vccz .LBB3_10 +; GFX90A-NEXT: .LBB3_6: ; %bb16 ; GFX90A-NEXT: ; Parent Loop BB3_2 Depth=1 ; GFX90A-NEXT: ; => This Inner Loop Header: Depth=2 ; GFX90A-NEXT: s_add_u32 s22, s20, s9 @@ -787,9 +800,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: s_and_b64 vcc, exec, s[2:3] ; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7 -; GFX90A-NEXT: ; %bb.6: ; %bb51 -; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2 +; 
GFX90A-NEXT: s_cbranch_vccnz .LBB3_8 +; GFX90A-NEXT: ; %bb.7: ; %bb51 +; GFX90A-NEXT: ; in Loop: Header=BB3_6 Depth=2 ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v23, v21 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; GFX90A-NEXT: v_cvt_f32_f16_e32 v22, v21 ; GFX90A-NEXT: v_cvt_f32_f16_sdwa v21, v20 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -803,21 +816,20 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg ; GFX90A-NEXT: v_pk_add_f32 v[10:11], v[10:11], v[16:17] ; GFX90A-NEXT: v_pk_add_f32 v[12:13], v[12:13], v[14:15] ; GFX90A-NEXT: s_branch .LBB3_4 -; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2 +; GFX90A-NEXT: .LBB3_8: ; in Loop: Header=BB3_6 Depth=2 ; GFX90A-NEXT: s_mov_b64 s[22:23], s[18:19] ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_4 -; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1 +; GFX90A-NEXT: ; %bb.9: ; in Loop: Header=BB3_6 Depth=2 ; GFX90A-NEXT: s_mov_b64 s[22:23], -1 ; GFX90A-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GFX90A-NEXT: ; implicit-def: $sgpr20_sgpr21 -; GFX90A-NEXT: .LBB3_9: ; %loop.exit.guard -; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 -; GFX90A-NEXT: s_xor_b64 s[18:19], s[22:23], -1 +; GFX90A-NEXT: s_mov_b64 s[24:25], -1 +; GFX90A-NEXT: s_branch .LBB3_5 ; GFX90A-NEXT: .LBB3_10: ; %Flow19 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 ; GFX90A-NEXT: s_mov_b64 s[2:3], -1 -; GFX90A-NEXT: s_and_b64 vcc, exec, s[18:19] +; GFX90A-NEXT: s_and_b64 vcc, exec, s[22:23] ; GFX90A-NEXT: s_cbranch_vccz .LBB3_1 ; GFX90A-NEXT: ; %bb.11: ; %bb12 ; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll index 8e16889c72e65..92b4e3bd886d8 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll @@ -2767,63 +2767,67 @@ define amdgpu_kernel void @urem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 +; GFX6-NEXT: s_and_b32 s4, s10, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s4 +; GFX6-NEXT: v_mov_b32_e32 v4, s10 +; GFX6-NEXT: v_alignbit_b32 v4, s11, v4, 16 ; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 -; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v4 +; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s4 ; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 16 +; GFX6-NEXT: v_and_b32_e32 v6, 0xffff, v1 +; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, v6 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v1 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mad_f32 v1, -v1, v2, v4 -; GFX6-NEXT: s_and_b32 s6, s11, 0xffff -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s6 
-; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: s_and_b32 s5, s9, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 -; GFX6-NEXT: v_sub_i32_e32 v5, vcc, s4, v1 -; GFX6-NEXT: s_lshr_b32 s4, s11, 16 -; GFX6-NEXT: v_mul_f32_e32 v1, v3, v4 +; GFX6-NEXT: v_mad_f32 v2, -v3, v0, v2 +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v0 +; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 +; GFX6-NEXT: v_trunc_f32_e32 v2, v2 +; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v3, vcc +; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_mad_f32 v2, -v2, v5, v6 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v5 +; GFX6-NEXT: s_and_b32 s4, s11, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s4 +; GFX6-NEXT: s_and_b32 s4, s9, 0xffff +; GFX6-NEXT: v_mul_lo_u32 v2, v2, v4 ; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 +; GFX6-NEXT: s_lshr_b32 s4, s11, 16 +; GFX6-NEXT: v_subrev_i32_e32 v2, vcc, v2, v1 +; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s4 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 ; GFX6-NEXT: s_lshr_b32 s5, s9, 16 ; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v5 ; GFX6-NEXT: v_trunc_f32_e32 v1, v1 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v3 +; GFX6-NEXT: v_mad_f32 v4, -v1, v3, v4 ; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, v6, v7 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v4|, v3 +; GFX6-NEXT: v_mul_f32_e32 v3, v6, v7 +; GFX6-NEXT: v_trunc_f32_e32 v3, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 ; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v6 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc +; GFX6-NEXT: v_mad_f32 v3, -v3, v5, v6 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc ; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s4 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s4 +; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s5, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s5, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -3291,72 +3295,73 @@ define amdgpu_kernel void @srem_v4i16(ptr addrspace(1) %out, <4 x i16> %x, <4 x ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s10, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_ashr_i32 s5, s8, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_alignbit_b32 v2, s11, v2, 16 +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 16 +; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, 
v5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s8, 16 -; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 +; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 +; GFX6-NEXT: v_trunc_f32_e32 v5, v5 +; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 +; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: s_lshr_b32 s7, s10, 16 -; GFX6-NEXT: s_or_b32 s8, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s8, 0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 +; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: s_sext_i32_i16 s4, s11 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX6-NEXT: s_sext_i32_i16 s5, s9 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v1 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s5 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, v2, v1 +; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v3 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_or_b32 s6, s4, 1 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_mul_f32_e32 v4, v1, v4 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX6-NEXT: v_mul_f32_e32 v4, v2, v4 ; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v1, -v4, v2, v1 +; GFX6-NEXT: v_mad_f32 v2, -v4, v3, v2 ; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v3| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v4 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4 ; GFX6-NEXT: s_ashr_i32 s4, s11, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s4 ; GFX6-NEXT: s_ashr_i32 s5, s9, 16 ; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s5 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v3 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 ; GFX6-NEXT: s_lshr_b32 s6, s9, 16 ; GFX6-NEXT: s_lshr_b32 s7, s11, 16 ; GFX6-NEXT: v_mul_f32_e32 v5, v4, v5 ; GFX6-NEXT: v_trunc_f32_e32 v5, v5 -; GFX6-NEXT: v_mad_f32 v4, -v5, v2, v4 +; GFX6-NEXT: v_mad_f32 v4, -v5, v3, v4 ; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 ; GFX6-NEXT: s_or_b32 s8, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v2| +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v4|, |v3| ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s8, 0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v5 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s11 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s7 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s9, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s6, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 -; GFX6-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v3 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v5 +; GFX6-NEXT: v_mul_lo_u32 v2, 
v2, s11 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s7 +; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, s6, v3 +; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; GFX6-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v2, v1 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX6-NEXT: s_endpgm ; @@ -4009,50 +4014,54 @@ define amdgpu_kernel void @urem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0xb ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 -; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) -; GFX6-NEXT: s_and_b32 s5, s10, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v0, s5 -; GFX6-NEXT: s_lshr_b32 s5, s10, 16 -; GFX6-NEXT: s_and_b32 s4, s8, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v2, s5 +; GFX6-NEXT: s_and_b32 s4, s10, 0xffff ; GFX6-NEXT: v_cvt_f32_u32_e32 v1, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v0 -; GFX6-NEXT: s_lshr_b32 s4, s8, 16 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v2 -; GFX6-NEXT: v_mul_f32_e32 v3, v1, v3 +; GFX6-NEXT: s_and_b32 s2, s8, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, s2 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v1 +; GFX6-NEXT: v_alignbit_b32 v2, s11, v2, 16 +; GFX6-NEXT: v_and_b32_e32 v5, 0xffff, v2 +; GFX6-NEXT: v_cvt_f32_u32_e32 v5, v5 +; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX6-NEXT: v_mov_b32_e32 v0, s8 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_alignbit_b32 v0, s9, v0, 16 +; GFX6-NEXT: v_mad_f32 v3, -v4, v1, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v4 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v1 +; GFX6-NEXT: v_and_b32_e32 v3, 0xffff, v0 +; GFX6-NEXT: v_cvt_f32_u32_e32 v3, v3 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v5 +; GFX6-NEXT: s_and_b32 s4, s11, 0xffff +; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v6, vcc +; GFX6-NEXT: v_cvt_f32_u32_e32 v6, s4 +; GFX6-NEXT: v_mul_lo_u32 v1, v1, s10 +; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 +; GFX6-NEXT: v_trunc_f32_e32 v4, v4 +; GFX6-NEXT: v_mad_f32 v3, -v4, v5, v3 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v4 +; GFX6-NEXT: s_and_b32 s4, s9, 0xffff +; GFX6-NEXT: v_cvt_f32_u32_e32 v7, s4 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v8, v6 +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s8, v1 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v5 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 +; GFX6-NEXT: v_mul_f32_e32 v3, v7, v8 ; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v1, -v3, v0, v1 -; GFX6-NEXT: v_cvt_u32_f32_e32 v6, v3 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v0 -; GFX6-NEXT: v_mul_f32_e32 v1, v4, v5 -; GFX6-NEXT: v_trunc_f32_e32 v1, v1 -; GFX6-NEXT: s_and_b32 s6, s11, 0xffff -; GFX6-NEXT: v_mad_f32 v3, -v1, v2, v4 -; GFX6-NEXT: v_cvt_f32_u32_e32 v4, s6 -; GFX6-NEXT: v_addc_u32_e32 v0, vcc, 0, v6, vcc -; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: s_and_b32 s6, s9, 0xffff -; GFX6-NEXT: v_cvt_f32_u32_e32 v5, s6 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v6, v4 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v2 -; GFX6-NEXT: v_mul_f32_e32 v2, v5, v6 -; GFX6-NEXT: v_trunc_f32_e32 v2, v2 -; GFX6-NEXT: v_cvt_u32_f32_e32 v3, v2 -; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX6-NEXT: v_mad_f32 v2, -v2, v4, v5 -; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v4 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s5 -; GFX6-NEXT: v_addc_u32_e32 v2, vcc, 0, v3, vcc 
-; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s4, v1 -; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 -; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX6-NEXT: v_cvt_u32_f32_e32 v4, v3 +; GFX6-NEXT: v_mad_f32 v3, -v3, v6, v7 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v3|, v6 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: v_addc_u32_e32 v3, vcc, 0, v4, vcc +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 +; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, v2, v0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v3 +; GFX6-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX6-NEXT: s_endpgm @@ -4421,47 +4430,48 @@ define amdgpu_kernel void @srem_v3i16(ptr addrspace(1) %out, <3 x i16> %x, <3 x ; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GFX6-NEXT: s_cselect_b32 s4, s6, 0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s4, v2 -; GFX6-NEXT: s_ashr_i32 s4, s10, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v1, s4 -; GFX6-NEXT: s_ashr_i32 s5, s8, 16 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s5 +; GFX6-NEXT: v_mov_b32_e32 v2, s10 +; GFX6-NEXT: v_alignbit_b32 v2, s11, v2, 16 +; GFX6-NEXT: v_bfe_i32 v3, v2, 0, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, v3 +; GFX6-NEXT: v_mov_b32_e32 v1, s8 +; GFX6-NEXT: v_alignbit_b32 v1, s9, v1, 16 +; GFX6-NEXT: v_bfe_i32 v5, v1, 0, 16 +; GFX6-NEXT: v_cvt_f32_i32_e32 v6, v5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v7, v4 ; GFX6-NEXT: v_mul_lo_u32 v0, v0, s10 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v3, v1 -; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: s_lshr_b32 s6, s8, 16 -; GFX6-NEXT: v_mul_f32_e32 v3, v2, v3 -; GFX6-NEXT: v_trunc_f32_e32 v3, v3 -; GFX6-NEXT: v_mad_f32 v2, -v3, v1, v2 -; GFX6-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 -; GFX6-NEXT: s_lshr_b32 s7, s10, 16 -; GFX6-NEXT: s_or_b32 s8, s4, 1 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v2|, |v1| -; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s8, 0 -; GFX6-NEXT: v_add_i32_e32 v1, vcc, s4, v3 +; GFX6-NEXT: v_xor_b32_e32 v3, v5, v3 ; GFX6-NEXT: s_sext_i32_i16 s4, s11 -; GFX6-NEXT: v_cvt_f32_i32_e32 v2, s4 +; GFX6-NEXT: v_mul_f32_e32 v5, v6, v7 +; GFX6-NEXT: v_trunc_f32_e32 v5, v5 +; GFX6-NEXT: v_sub_i32_e32 v0, vcc, s8, v0 +; GFX6-NEXT: v_mad_f32 v6, -v5, v4, v6 +; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 30, v3 +; GFX6-NEXT: v_cmp_ge_f32_e64 vcc, |v6|, |v4| +; GFX6-NEXT: v_cvt_f32_i32_e32 v4, s4 +; GFX6-NEXT: v_or_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v3, v5 ; GFX6-NEXT: s_sext_i32_i16 s5, s9 +; GFX6-NEXT: v_mul_lo_u32 v2, v3, v2 ; GFX6-NEXT: v_cvt_f32_i32_e32 v3, s5 +; GFX6-NEXT: v_rcp_iflag_f32_e32 v5, v4 ; GFX6-NEXT: s_xor_b32 s4, s5, s4 -; GFX6-NEXT: v_rcp_iflag_f32_e32 v4, v2 ; GFX6-NEXT: s_ashr_i32 s4, s4, 30 -; GFX6-NEXT: v_mul_lo_u32 v1, v1, s7 -; GFX6-NEXT: s_or_b32 s7, s4, 1 -; GFX6-NEXT: v_mul_f32_e32 v4, v3, v4 -; GFX6-NEXT: v_trunc_f32_e32 v4, v4 -; GFX6-NEXT: v_mad_f32 v3, -v4, v2, v3 -; GFX6-NEXT: v_cvt_i32_f32_e32 v4, v4 -; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v2| +; GFX6-NEXT: s_or_b32 s6, s4, 1 +; GFX6-NEXT: v_mul_f32_e32 v5, v3, v5 +; GFX6-NEXT: v_trunc_f32_e32 v5, v5 +; GFX6-NEXT: v_mad_f32 v3, -v5, v4, v3 +; GFX6-NEXT: v_cvt_i32_f32_e32 v5, v5 +; GFX6-NEXT: v_cmp_ge_f32_e64 s[4:5], |v3|, |v4| 
; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GFX6-NEXT: s_cselect_b32 s4, s7, 0 -; GFX6-NEXT: v_add_i32_e32 v2, vcc, s4, v4 -; GFX6-NEXT: v_mul_lo_u32 v2, v2, s11 -; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 +; GFX6-NEXT: s_cselect_b32 s4, s6, 0 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, s4, v5 +; GFX6-NEXT: v_mul_lo_u32 v3, v3, s11 +; GFX6-NEXT: v_subrev_i32_e32 v1, vcc, v2, v1 ; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v2 +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, s9, v3 ; GFX6-NEXT: v_and_b32_e32 v0, 0xffff, v0 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX6-NEXT: buffer_store_short v2, off, s[0:3], 0 offset:4 diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll index 51afa79674a80..56e592506855c 100644 --- a/llvm/test/CodeGen/AMDGPU/anyext.ll +++ b/llvm/test/CodeGen/AMDGPU/anyext.ll @@ -74,8 +74,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; GCN-NEXT: v_lshlrev_b32_e32 v0, 1, v1 ; GCN-NEXT: s_mov_b64 s[6:7], s[14:15] ; GCN-NEXT: v_mov_b32_e32 v1, v3 -; GCN-NEXT: buffer_load_ushort v2, v[2:3], s[12:15], 0 addr64 -; GCN-NEXT: buffer_load_ushort v0, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: buffer_load_ubyte v2, v[2:3], s[12:15], 0 addr64 +; GCN-NEXT: buffer_load_ubyte v0, v[0:1], s[4:7], 0 addr64 ; GCN-NEXT: s_mov_b32 s10, -1 ; GCN-NEXT: s_mov_b32 s8, s0 ; GCN-NEXT: s_mov_b32 s9, s1 @@ -99,8 +99,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: flat_load_ushort v2, v[2:3] -; GFX8-NEXT: flat_load_ushort v0, v[0:1] +; GFX8-NEXT: flat_load_ubyte v2, v[2:3] +; GFX8-NEXT: flat_load_ubyte v0, v[0:1] ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt vmcnt(0) @@ -118,8 +118,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 1, v1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_ushort v2, v0, s[2:3] -; GFX9-NEXT: global_load_ushort v3, v1, s[6:7] +; GFX9-NEXT: global_load_ubyte v2, v0, s[2:3] +; GFX9-NEXT: global_load_ubyte v3, v1, s[6:7] ; GFX9-NEXT: s_mov_b32 s3, 0xf000 ; GFX9-NEXT: s_mov_b32 s2, -1 ; GFX9-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll index 0c624a83ae1be..d695fc7dfe8f3 100644 --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll @@ -41,8 +41,11 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX7-NEXT: .LBB0_4: ; %Flow ; GFX7-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX7-NEXT: s_wqm_b64 s[4:5], -1 -; GFX7-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX7-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX7-NEXT: s_waitcnt expcnt(0) +; GFX7-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX7-NEXT: v_and_b32_e32 v1, v1, v1 +; GFX7-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX7-NEXT: s_cbranch_vccnz .LBB0_6 ; GFX7-NEXT: ; %bb.5: ; %if ; GFX7-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -76,8 +79,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX89-NEXT: .LBB0_4: ; %Flow ; GFX89-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX89-NEXT: s_wqm_b64 s[4:5], -1 -; GFX89-NEXT: s_and_b64 
s[4:5], s[4:5], s[4:5] -; GFX89-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX89-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX89-NEXT: v_and_b32_e32 v1, v1, v1 +; GFX89-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX89-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX89-NEXT: s_cbranch_vccnz .LBB0_6 ; GFX89-NEXT: ; %bb.5: ; %if ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -112,8 +117,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1064-NEXT: .LBB0_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX1064-NEXT: v_and_b32_e32 v1, v1, v1 +; GFX1064-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1064-NEXT: s_cbranch_vccnz .LBB0_6 ; GFX1064-NEXT: ; %bb.5: ; %if ; GFX1064-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -147,8 +154,10 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1032-NEXT: .LBB0_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 -; GFX1032-NEXT: s_and_b32 s4, s4, s4 -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX1032-NEXT: v_and_b32_e32 v1, v1, v1 +; GFX1032-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1032-NEXT: s_cbranch_vccnz .LBB0_6 ; GFX1032-NEXT: ; %bb.5: ; %if ; GFX1032-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -186,9 +195,12 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1164-NEXT: .LBB0_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX1164-NEXT: v_and_b32_e32 v1, v1, v1 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GFX1164-NEXT: s_cbranch_vccnz .LBB0_6 ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -225,9 +237,12 @@ define amdgpu_ps void @add_i32_constant(ptr addrspace(8) inreg %out, ptr addrspa ; GFX1132-NEXT: .LBB0_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: s_wqm_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_b32 s4, s4, s4 -; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX1132-NEXT: v_and_b32_e32 v1, v1, v1 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 ; GFX1132-NEXT: s_cbranch_vccnz .LBB0_6 ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v0, off, s[0:3], 0 @@ -307,8 +322,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX8-NEXT: .LBB1_4: ; %Flow ; GFX8-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX8-NEXT: s_wqm_b64 s[4:5], -1 
-; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX8-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8-NEXT: v_and_b32_e32 v0, v0, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX8-NEXT: ; %bb.5: ; %if ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], 0 @@ -362,8 +379,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX9-NEXT: .LBB1_4: ; %Flow ; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_wqm_b64 s[4:5], -1 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX9-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v0, v0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX9-NEXT: ; %bb.5: ; %if ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], 0 @@ -422,8 +441,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1064-NEXT: .LBB1_4: ; %Flow ; GFX1064-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1064-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1064-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX1064-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX1064-NEXT: v_and_b32_e32 v0, v0, v0 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX1064-NEXT: ; %bb.5: ; %if ; GFX1064-NEXT: buffer_store_dword v4, off, s[0:3], 0 @@ -472,8 +493,10 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1032-NEXT: .LBB1_4: ; %Flow ; GFX1032-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1032-NEXT: s_wqm_b32 s4, -1 -; GFX1032-NEXT: s_and_b32 s4, s4, s4 -; GFX1032-NEXT: s_andn2_b32 vcc_lo, exec_lo, s4 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1032-NEXT: v_and_b32_e32 v0, v0, v0 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX1032-NEXT: ; %bb.5: ; %if ; GFX1032-NEXT: buffer_store_dword v4, off, s[0:3], 0 @@ -541,9 +564,12 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1164-NEXT: .LBB1_4: ; %Flow ; GFX1164-NEXT: s_or_b64 exec, exec, s[8:9] ; GFX1164-NEXT: s_wqm_b64 s[4:5], -1 -; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1164-NEXT: s_and_b64 s[4:5], s[4:5], s[4:5] -; GFX1164-NEXT: s_and_not1_b64 vcc, exec, s[4:5] +; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX1164-NEXT: v_and_b32_e32 v0, v0, v0 +; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1164-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX1164-NEXT: ; %bb.5: ; %if ; GFX1164-NEXT: buffer_store_b32 v4, off, s[0:3], 0 @@ -598,9 +624,12 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac ; GFX1132-NEXT: .LBB1_4: ; %Flow ; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s8 ; GFX1132-NEXT: s_wqm_b32 s4, -1 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX1132-NEXT: s_and_b32 s4, s4, s4 -; GFX1132-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s4 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX1132-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1132-NEXT: v_and_b32_e32 v0, v0, v0 +; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1132-NEXT: s_cbranch_vccnz .LBB1_6 ; GFX1132-NEXT: ; %bb.5: ; %if ; GFX1132-NEXT: buffer_store_b32 v4, off, s[0:3], 0 diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index a597faa028f22..87c4b0b37dbeb 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -113,37 +113,42 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX-942: ; %bb.0: ; GFX-942-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 -; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| +; GFX-942-NEXT: s_movk_i32 s2, 0x7fff +; GFX-942-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX-942-NEXT: v_and_or_b32 v4, v6, 1, v4 +; GFX-942-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1] +; GFX-942-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX-942-NEXT: v_add_u32_e32 v7, v6, v7 +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1 -; GFX-942-NEXT: s_movk_i32 s4, 0x7fff -; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s4 +; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s2 ; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX-942-NEXT: s_nop 1 ; GFX-942-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GFX-942-NEXT: v_cvt_f32_f64_e32 v5, v[2:3] ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[0:1], v5 -; GFX-942-NEXT: v_and_b32_e32 v6, 1, v5 -; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[0:1]| ; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[0:1] -; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6 -; GFX-942-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] -; GFX-942-NEXT: v_add_u32_e32 v0, v5, v0 -; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX-942-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[2:3]|, |v[0:1]| +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX-942-NEXT: v_and_or_b32 v0, v5, 1, v0 +; GFX-942-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[0:1] +; GFX-942-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX-942-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX-942-NEXT: s_mov_b32 s0, 0x7060302 +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; GFX-942-NEXT: v_bfe_u32 v1, v0, 16, 1 -; GFX-942-NEXT: v_add3_u32 v1, v1, v0, s4 +; GFX-942-NEXT: v_add3_u32 v1, v1, v0, s2 ; GFX-942-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] -; GFX-942-NEXT: s_mov_b32 s0, 0x7060302 -; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: s_nop 1 ; GFX-942-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; GFX-942-NEXT: v_perm_b32 v0, v0, v4, s0 ; GFX-942-NEXT: ; return to shader part epilog @@ -152,24 +157,27 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX-950: ; %bb.0: ; 
GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3] ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]| ; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5] -; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 -; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3] -; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[2:3]|, |v[4:5]| ; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1] -; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX-950-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX-950-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX-950-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1] +; GFX-950-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX-950-NEXT: v_add_u32_e32 v7, v6, v7 +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 -; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5 -; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]| +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[2:3]| +; GFX-950-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3] -; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6 -; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] -; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0 -; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX-950-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[0:1] +; GFX-950-NEXT: v_add_u32_e32 v6, v5, v6 +; GFX-950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX-950-NEXT: v_or_b32_e32 v0, v0, v5 +; GFX-950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4 ; GFX-950-NEXT: ; return to shader part epilog %res = fptrunc <2 x double> %src to <2 x bfloat> @@ -340,16 +348,19 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { ; GFX-942: ; %bb.0: ; %entry ; GFX-942-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; GFX-942-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; GFX-942-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 -; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX-942-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX-942-NEXT: v_and_or_b32 v4, v6, 1, v4 +; GFX-942-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1] +; GFX-942-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX-942-NEXT: v_add_u32_e32 v7, v6, v7 +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc +; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0 ; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX-942-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] @@ -362,14 +373,17 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { ; GFX-950: ; %bb.0: ; %entry ; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| ; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] 
-; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 -; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] -; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0 -; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| +; GFX-950-NEXT: s_nop 0 +; GFX-950-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX-950-NEXT: v_and_or_b32 v0, v6, 1, v0 +; GFX-950-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1] +; GFX-950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX-950-NEXT: v_add_u32_e32 v7, v6, v7 +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX-950-NEXT: flat_store_short v[2:3], v0 ; GFX-950-NEXT: s_endpgm @@ -384,16 +398,20 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { ; GFX-942: ; %bb.0: ; %entry ; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1] ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-942-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1] ; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], -v[0:1], v[4:5] -; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX-942-NEXT: v_add_u32_e32 v7, v6, v7 +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX-942-NEXT: v_and_or_b32 v4, v6, 1, v4 +; GFX-942-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc +; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0 ; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX-942-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1] @@ -406,14 +424,18 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { ; GFX-950: ; %bb.0: ; %entry ; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, -v[0:1] ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1] ; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], -v[0:1], v[4:5] -; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] -; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0 -; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX-950-NEXT: v_add_u32_e32 v7, v6, v7 +; GFX-950-NEXT: s_nop 0 +; GFX-950-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX-950-NEXT: v_and_or_b32 v0, v6, 1, v0 +; GFX-950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX-950-NEXT: flat_store_short v[2:3], v0 ; GFX-950-NEXT: s_endpgm @@ -429,16 +451,20 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { ; GFX-942: ; %bb.0: ; %entry ; GFX-942-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| ; GFX-942-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-942-NEXT: v_and_b32_e32 v7, 1, 
v6 -; GFX-942-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; GFX-942-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| +; GFX-942-NEXT: s_nop 1 +; GFX-942-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1] ; GFX-942-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX-942-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3] -; GFX-942-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX-942-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-942-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1 +; GFX-942-NEXT: v_add_u32_e32 v7, v6, v7 +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; GFX-942-NEXT: v_and_or_b32 v4, v6, 1, v4 +; GFX-942-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX-942-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GFX-942-NEXT: s_movk_i32 s0, 0x7fff +; GFX-942-NEXT: s_nop 0 +; GFX-942-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc +; GFX-942-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX-942-NEXT: v_add3_u32 v5, v5, v4, s0 ; GFX-942-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; GFX-942-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]| @@ -451,14 +477,18 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { ; GFX-950: ; %bb.0: ; %entry ; GFX-950-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]| ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[4:5]| +; GFX-950-NEXT: v_cmp_gt_f64_e64 s[0:1], |v[0:1]|, |v[4:5]| +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[0:1] ; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5] -; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 -; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] -; GFX-950-NEXT: v_add_u32_e32 v0, v6, v0 -; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; GFX-950-NEXT: v_add_u32_e32 v7, v6, v7 +; GFX-950-NEXT: s_nop 0 +; GFX-950-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX-950-NEXT: v_and_or_b32 v0, v6, 1, v0 +; GFX-950-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX-950-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX-950-NEXT: s_nop 1 +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 ; GFX-950-NEXT: flat_store_short v[2:3], v0 ; GFX-950-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll index c4957fd44e2be..0b2e25d6aebb8 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -2290,13 +2290,15 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX8-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] ; GFX8-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 ; GFX8-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; GFX8-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| ; GFX8-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; GFX8-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; GFX8-NEXT: v_add_u32_e64 v4, s[6:7], v6, v4 -; GFX8-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX8-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, |v[4:5]| +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[4:5] +; GFX8-NEXT: v_or_b32_e32 v4, v4, v7 +; GFX8-NEXT: v_add_u32_e32 v5, vcc, v6, v8 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX8-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX8-NEXT: v_or_b32_e32 v6, 0x400000, v4 ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v5, v4 @@ -2312,22 +2314,23 @@ define 
void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: s_movk_i32 s8, 0x7fff +; GFX9-NEXT: s_movk_i32 s6, 0x7fff ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] ; GFX9-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX9-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; GFX9-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| ; GFX9-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; GFX9-NEXT: v_add_u32_e32 v4, v6, v4 -; GFX9-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; GFX9-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, |v[4:5]| +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX9-NEXT: v_and_or_b32 v4, v6, 1, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX9-NEXT: v_add_u32_e32 v5, v6, v7 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v6, vcc ; GFX9-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; GFX9-NEXT: v_bfe_u32 v5, v4, 16, 1 ; GFX9-NEXT: v_or_b32_e32 v6, 0x400000, v4 -; GFX9-NEXT: v_add3_u32 v4, v5, v4, s8 +; GFX9-NEXT: v_add3_u32 v4, v5, v4, s6 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc ; GFX9-NEXT: global_store_short_d16_hi v[2:3], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -2340,13 +2343,14 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] ; GFX10-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX10-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX10-NEXT: v_cmp_gt_f64_e64 s4, |v[0:1]|, |v[4:5]| ; GFX10-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5] +; GFX10-NEXT: v_cmp_gt_f64_e64 s4, |v[0:1]|, |v[4:5]| +; GFX10-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v4, -1, 1, s4 -; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 1, v7 +; GFX10-NEXT: v_and_or_b32 v7, v6, 1, v7 ; GFX10-NEXT: v_add_nc_u32_e32 v4, v6, v4 -; GFX10-NEXT: s_or_b32 vcc_lo, vcc_lo, s4 +; GFX10-NEXT: v_and_b32_e32 v5, 1, v7 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 ; GFX10-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX10-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] ; GFX10-NEXT: v_bfe_u32 v5, v4, 16, 1 @@ -2362,17 +2366,19 @@ define void @test_load_store_f64_to_bf16(ptr addrspace(1) %in, ptr addrspace(1) ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX11-NEXT: v_and_b32_e32 v7, 1, v6 -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]| ; GFX11-NEXT: v_cmp_nlg_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_cmp_gt_f64_e64 s0, |v[0:1]|, |v[4:5]| +; GFX11-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e64 v4, -1, 1, s0 -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 1, v7 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_or_b32 v7, v6, 1, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) 
; GFX11-NEXT: v_add_nc_u32_e32 v4, v6, v4 -; GFX11-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 +; GFX11-NEXT: v_and_b32_e32 v5, 1, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v5 ; GFX11-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc_lo ; GFX11-NEXT: v_cmp_u_f64_e32 vcc_lo, v[0:1], v[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll index 2bf4a2c028fdc..95c5a7e11dfb5 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-condition-and.ll @@ -13,10 +13,14 @@ define amdgpu_ps void @ham(float %arg, float %arg1) #0 { ; GCN-LABEL: ham: ; GCN: ; %bb.0: ; %bb +; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GCN-NEXT: v_cmp_lt_f32_e32 vcc, 0, v0 -; GCN-NEXT: v_cmp_lt_f32_e64 s[0:1], 0, v1 -; GCN-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GCN-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GCN-NEXT: ; %bb.1: ; %bb4 ; GCN-NEXT: v_mov_b32_e32 v0, 4 ; GCN-NEXT: s_mov_b32 m0, -1 diff --git a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll index b03ade4b527e6..5fc8153eb1d1e 100644 --- a/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll +++ b/llvm/test/CodeGen/AMDGPU/branch-relaxation.ll @@ -1272,11 +1272,11 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_cmp_lt_i32 s3, 6 ; GCN-NEXT: s_cbranch_scc1 .LBB10_1 -; GCN-NEXT: ; %bb.8: ; %bb +; GCN-NEXT: ; %bb.10: ; %bb ; GCN-NEXT: s_getpc_b64 s[8:9] -; GCN-NEXT: .Lpost_getpc12: -; GCN-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295 -; GCN-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32 +; GCN-NEXT: .Lpost_getpc13: +; GCN-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc13)&4294967295 +; GCN-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc13)>>32 ; GCN-NEXT: s_setpc_b64 s[8:9] ; GCN-NEXT: .LBB10_1: ; %bb13 ; GCN-NEXT: ;;#ASMSTART @@ -1286,32 +1286,44 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GCN-NEXT: v_nop_e64 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_cbranch_execz .LBB10_3 -; GCN-NEXT: s_branch .LBB10_4 +; GCN-NEXT: ; %bb.8: ; %bb13 +; GCN-NEXT: s_getpc_b64 s[10:11] +; GCN-NEXT: .Lpost_getpc12: +; GCN-NEXT: s_add_u32 s10, s10, (.LBB10_4-.Lpost_getpc12)&4294967295 +; GCN-NEXT: s_addc_u32 s11, s11, (.LBB10_4-.Lpost_getpc12)>>32 +; GCN-NEXT: s_setpc_b64 s[10:11] ; GCN-NEXT: .LBB10_2: ; GCN-NEXT: s_mov_b64 s[8:9], 0 ; GCN-NEXT: .LBB10_3: ; %bb9 ; GCN-NEXT: s_cmp_lt_i32 s3, 11 ; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GCN-NEXT: s_cmp_ge_i32 s2, s3 -; GCN-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GCN-NEXT: s_and_b64 s[8:9], s[10:11], s[8:9] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] +; GCN-NEXT: v_and_b32_e32 v0, v1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GCN-NEXT: .LBB10_4: ; %Flow5 ; GCN-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GCN-NEXT: s_cbranch_vccz .LBB10_5 -; GCN-NEXT: ; %bb.10: ; %Flow5 +; GCN-NEXT: ; %bb.12: ; %Flow5 ; GCN-NEXT: s_getpc_b64 s[0:1] 
-; GCN-NEXT: .Lpost_getpc13: -; GCN-NEXT: s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295 -; GCN-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32 +; GCN-NEXT: .Lpost_getpc14: +; GCN-NEXT: s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc14)&4294967295 +; GCN-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc14)>>32 ; GCN-NEXT: s_setpc_b64 s[0:1] ; GCN-NEXT: .LBB10_5: ; %bb14 ; GCN-NEXT: s_cmp_lt_i32 s1, 9 ; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GCN-NEXT: s_cmp_lt_i32 s2, s3 -; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GCN-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] ; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GCN-NEXT: v_and_b32_e32 v0, v1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_branch .LBB10_7 ; GCN-NEXT: .LBB10_6: ; GCN-NEXT: ; implicit-def: $vgpr0 @@ -1337,12 +1349,12 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX11-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX11-NEXT: s_cmp_lt_i32 s3, 6 ; GFX11-NEXT: s_cbranch_scc1 .LBB10_1 -; GFX11-NEXT: ; %bb.8: ; %bb +; GFX11-NEXT: ; %bb.10: ; %bb ; GFX11-NEXT: s_getpc_b64 s[8:9] -; GFX11-NEXT: .Lpost_getpc11: +; GFX11-NEXT: .Lpost_getpc12: ; GFX11-NEXT: s_waitcnt_depctr 0xfffe -; GFX11-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc11)&4294967295 -; GFX11-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc11)>>32 +; GFX11-NEXT: s_add_u32 s8, s8, (.LBB10_2-.Lpost_getpc12)&4294967295 +; GFX11-NEXT: s_addc_u32 s9, s9, (.LBB10_2-.Lpost_getpc12)>>32 ; GFX11-NEXT: s_waitcnt_depctr 0xfffe ; GFX11-NEXT: s_setpc_b64 s[8:9] ; GFX11-NEXT: .LBB10_1: ; %bb13 @@ -1353,30 +1365,55 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX11-NEXT: v_nop_e64 ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_cbranch_execz .LBB10_3 -; GFX11-NEXT: s_branch .LBB10_4 +; GFX11-NEXT: ; %bb.8: ; %bb13 +; GFX11-NEXT: s_getpc_b64 s[10:11] +; GFX11-NEXT: .Lpost_getpc11: +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s10, s10, (.LBB10_4-.Lpost_getpc11)&4294967295 +; GFX11-NEXT: s_addc_u32 s11, s11, (.LBB10_4-.Lpost_getpc11)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[10:11] ; GFX11-NEXT: .LBB10_2: ; GFX11-NEXT: s_mov_b64 s[8:9], 0 ; GFX11-NEXT: .LBB10_3: ; %bb9 ; GFX11-NEXT: s_cmp_lt_i32 s3, 11 ; GFX11-NEXT: s_cselect_b64 s[8:9], -1, 0 ; GFX11-NEXT: s_cmp_ge_i32 s2, s3 -; GFX11-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b64 s[8:9], s[10:11], s[8:9] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] +; GFX11-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v0 ; GFX11-NEXT: .LBB10_4: ; %Flow5 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: s_and_not1_b64 vcc, exec, s[8:9] -; GFX11-NEXT: s_cbranch_vccnz .LBB10_6 -; GFX11-NEXT: ; %bb.5: ; %bb14 +; GFX11-NEXT: s_cbranch_vccz .LBB10_5 +; GFX11-NEXT: ; %bb.12: ; %Flow5 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: .Lpost_getpc13: +; GFX11-NEXT: 
s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_add_u32 s0, s0, (.LBB10_6-.Lpost_getpc13)&4294967295 +; GFX11-NEXT: s_addc_u32 s1, s1, (.LBB10_6-.Lpost_getpc13)>>32 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_setpc_b64 s[0:1] +; GFX11-NEXT: .LBB10_5: ; %bb14 ; GFX11-NEXT: s_cmp_lt_i32 s1, 9 ; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX11-NEXT: s_cmp_lt_i32 s2, s3 -; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; GFX11-NEXT: s_and_b64 s[0:1], s[6:7], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX11-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GFX11-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_branch .LBB10_7 ; GFX11-NEXT: .LBB10_6: ; GFX11-NEXT: ; implicit-def: $vgpr0 @@ -1387,7 +1424,7 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_lshl_b64 s[0:1], s[0:1], 2 -; GFX11-NEXT: s_waitcnt_depctr 0xfffe +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_u32 s0, s2, s0 ; GFX11-NEXT: s_addc_u32 s1, s3, s1 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -1405,46 +1442,51 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: s_cselect_b32 s8, -1, 0 ; GFX12-NEXT: s_cmp_lt_i32 s3, 6 ; GFX12-NEXT: s_cbranch_scc0 .LBB10_1 -; GFX12-NEXT: ; %bb.18: ; %bb +; GFX12-NEXT: ; %bb.10: ; %bb ; GFX12-NEXT: s_getpc_b64 s[10:11] -; GFX12-NEXT: .Lpost_getpc17: +; GFX12-NEXT: .Lpost_getpc13: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_u32 s10, s10, (.LBB10_4-.Lpost_getpc17)&4294967295 -; GFX12-NEXT: s_add_co_ci_u32 s11, s11, (.LBB10_4-.Lpost_getpc17)>>32 +; GFX12-NEXT: s_add_co_u32 s10, s10, (.LBB10_4-.Lpost_getpc13)&4294967295 +; GFX12-NEXT: s_add_co_ci_u32 s11, s11, (.LBB10_4-.Lpost_getpc13)>>32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[10:11] ; GFX12-NEXT: .LBB10_1: ; %Flow ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s7 ; GFX12-NEXT: s_cbranch_vccnz .LBB10_2 -; GFX12-NEXT: ; %bb.10: ; %Flow +; GFX12-NEXT: ; %bb.12: ; %Flow ; GFX12-NEXT: s_getpc_b64 s[8:9] -; GFX12-NEXT: .Lpost_getpc13: +; GFX12-NEXT: .Lpost_getpc14: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_5-.Lpost_getpc13)&4294967295 -; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc13)>>32 +; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_5-.Lpost_getpc14)&4294967295 +; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_5-.Lpost_getpc14)>>32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[8:9] ; GFX12-NEXT: .LBB10_2: ; %Flow5 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_vccz .LBB10_3 -; GFX12-NEXT: ; %bb.12: ; %Flow5 +; GFX12-NEXT: ; %bb.14: ; %Flow5 ; GFX12-NEXT: s_getpc_b64 s[0:1] -; GFX12-NEXT: .Lpost_getpc14: +; GFX12-NEXT: .Lpost_getpc15: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_u32 s0, s0, (.LBB10_6-.Lpost_getpc14)&4294967295 -; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB10_6-.Lpost_getpc14)>>32 +; GFX12-NEXT: 
s_add_co_u32 s0, s0, (.LBB10_6-.Lpost_getpc15)&4294967295 +; GFX12-NEXT: s_add_co_ci_u32 s1, s1, (.LBB10_6-.Lpost_getpc15)>>32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[0:1] ; GFX12-NEXT: .LBB10_3: ; %bb14 ; GFX12-NEXT: s_cmp_lt_i32 s1, 9 ; GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; GFX12-NEXT: s_cmp_lt_i32 s2, s3 -; GFX12-NEXT: s_cselect_b32 s1, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_or_b32 s0, s1, s0 -; GFX12-NEXT: s_and_b32 s0, s6, s0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX12-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: ; %bb.8: ; %bb14 ; GFX12-NEXT: s_getpc_b64 s[0:1] ; GFX12-NEXT: .Lpost_getpc12: @@ -1462,29 +1504,35 @@ define amdgpu_kernel void @long_branch_hang(ptr addrspace(1) nocapture %arg, i32 ; GFX12-NEXT: v_nop_e64 ; GFX12-NEXT: ;;#ASMEND ; GFX12-NEXT: s_cbranch_execz .LBB10_5 -; GFX12-NEXT: ; %bb.14: ; %bb13 +; GFX12-NEXT: ; %bb.16: ; %bb13 ; GFX12-NEXT: s_getpc_b64 s[8:9] -; GFX12-NEXT: .Lpost_getpc15: +; GFX12-NEXT: .Lpost_getpc16: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_2-.Lpost_getpc15)&4294967295 -; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_2-.Lpost_getpc15)>>32 +; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_2-.Lpost_getpc16)&4294967295 +; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_2-.Lpost_getpc16)>>32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[8:9] ; GFX12-NEXT: .LBB10_5: ; %bb9 ; GFX12-NEXT: s_cmp_lt_i32 s3, 11 ; GFX12-NEXT: s_cselect_b32 s0, -1, 0 ; GFX12-NEXT: s_cmp_ge_i32 s2, s3 -; GFX12-NEXT: s_cselect_b32 s7, -1, 0 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, s7, s0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-NEXT: s_cselect_b32 s0, -1, 0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 ; GFX12-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 ; GFX12-NEXT: s_cbranch_vccnz .LBB10_6 -; GFX12-NEXT: ; %bb.16: ; %bb9 +; GFX12-NEXT: ; %bb.18: ; %bb9 ; GFX12-NEXT: s_getpc_b64 s[8:9] -; GFX12-NEXT: .Lpost_getpc16: +; GFX12-NEXT: .Lpost_getpc17: ; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_3-.Lpost_getpc16)&4294967295 -; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_3-.Lpost_getpc16)>>32 +; GFX12-NEXT: s_add_co_u32 s8, s8, (.LBB10_3-.Lpost_getpc17)&4294967295 +; GFX12-NEXT: s_add_co_ci_u32 s9, s9, (.LBB10_3-.Lpost_getpc17)>>32 ; GFX12-NEXT: s_wait_alu 0xfffe ; GFX12-NEXT: s_setpc_b64 s[8:9] ; GFX12-NEXT: .LBB10_6: diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll index f78cb0daee5c9..1414af0271380 100644 --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -368,13 +368,13 
@@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: s_mov_b32 m0, -1 ; GCN-NEXT: ds_read_u8 v0, v0 -; GCN-NEXT: s_load_dword s4, s[4:5], 0x9 +; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_bitcmp1_b32 s0, 0 -; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; GCN-NEXT: s_add_i32 s0, s4, 0x80 +; GCN-NEXT: v_readfirstlane_b32 s1, v0 +; GCN-NEXT: s_not_b32 s1, s1 +; GCN-NEXT: s_bitcmp1_b32 s1, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: s_addk_i32 s0, 0x80 ; GCN-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN-NEXT: .LBB4_1: ; %for.body ; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 @@ -406,14 +406,13 @@ define amdgpu_kernel void @loop_arg_0(ptr addrspace(3) %ptr, i32 %n) nounwind { ; GCN_DBG-NEXT: ds_read_u8 v0, v0 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_readfirstlane_b32 s0, v0 -; GCN_DBG-NEXT: s_and_b32 s0, 1, s0 -; GCN_DBG-NEXT: s_cmp_eq_u32 s0, 1 -; GCN_DBG-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 -; GCN_DBG-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 1 -; GCN_DBG-NEXT: v_writelane_b32 v2, s1, 2 +; GCN_DBG-NEXT: s_not_b32 s0, s0 +; GCN_DBG-NEXT: s_and_b32 s1, 1, s0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 +; GCN_DBG-NEXT: s_cmp_eq_u32 s1, 1 +; GCN_DBG-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN_DBG-NEXT: v_writelane_b32 v2, s2, 1 +; GCN_DBG-NEXT: v_writelane_b32 v2, s3, 2 ; GCN_DBG-NEXT: v_writelane_b32 v2, s0, 3 ; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN_DBG-NEXT: buffer_store_dword v2, off, s[12:15], 0 ; 4-byte Folded Spill diff --git a/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll b/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll index d07cc84865bea..6afadf9a0570b 100644 --- a/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll +++ b/llvm/test/CodeGen/AMDGPU/coalescer_distribute.ll @@ -9,15 +9,16 @@ define amdgpu_kernel void @hoge(i1 %c0, i1 %c1, i1 %c2, i1 %c3, i1 %c4) { ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_load_dword s2, s[4:5], 0x9 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_bitcmp1_b32 s2, 0 -; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_and_b64 s[4:5], s[0:1], vcc -; CHECK-NEXT: s_and_saveexec_b64 s[0:1], s[4:5] +; CHECK-NEXT: v_and_b32_e32 v0, s2, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: s_and_saveexec_b64 s[0:1], vcc ; CHECK-NEXT: s_or_b64 exec, exec, s[0:1] -; CHECK-NEXT: s_bitcmp1_b32 s2, 24 +; CHECK-NEXT: s_not_b32 s0, s2 +; CHECK-NEXT: s_bitcmp1_b32 s0, 24 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_xor_b64 s[0:1], s[0:1], -1 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0 ; CHECK-NEXT: .LBB0_1: ; %bb25 diff --git a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll index 31c23b94a8de8..e9a4230ee2c57 100644 --- a/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll +++ b/llvm/test/CodeGen/AMDGPU/codegen-prepare-addrspacecast-non-null.ll @@ -322,23 +322,25 @@ define void @recursive_phis(i1 %cond, ptr addrspace(5) %ptr) { ; DAGISEL-ASM-LABEL: recursive_phis: ; DAGISEL-ASM: ; %bb.0: ; %entry ; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; DAGISEL-ASM-NEXT: 
v_and_b32_e32 v2, 1, v0 +; DAGISEL-ASM-NEXT: v_not_b32_e32 v0, v0 ; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 1, v0 +; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 ; DAGISEL-ASM-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; DAGISEL-ASM-NEXT: v_lshrrev_b32_e64 v0, 6, s32 -; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[4:5], vcc +; DAGISEL-ASM-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; DAGISEL-ASM-NEXT: ; %bb.1: ; %then ; DAGISEL-ASM-NEXT: v_and_b32_e32 v0, 0xffff, v1 ; DAGISEL-ASM-NEXT: ; %bb.2: ; %finallyendcf.split -; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[4:5] -; DAGISEL-ASM-NEXT: s_xor_b64 s[6:7], vcc, -1 +; DAGISEL-ASM-NEXT: s_or_b64 exec, exec, s[6:7] ; DAGISEL-ASM-NEXT: s_mov_b64 s[4:5], 0 -; DAGISEL-ASM-NEXT: s_mov_b64 s[8:9], src_private_base +; DAGISEL-ASM-NEXT: s_mov_b64 s[6:7], src_private_base ; DAGISEL-ASM-NEXT: v_mov_b32_e32 v2, 7 ; DAGISEL-ASM-NEXT: .LBB11_3: ; %finally ; DAGISEL-ASM-NEXT: ; =>This Inner Loop Header: Depth=1 -; DAGISEL-ASM-NEXT: s_and_b64 s[10:11], exec, s[6:7] -; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] -; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s9 +; DAGISEL-ASM-NEXT: s_and_b64 s[8:9], exec, vcc +; DAGISEL-ASM-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5] +; DAGISEL-ASM-NEXT: v_mov_b32_e32 v1, s7 ; DAGISEL-ASM-NEXT: flat_store_dword v[0:1], v2 ; DAGISEL-ASM-NEXT: s_waitcnt vmcnt(0) ; DAGISEL-ASM-NEXT: s_andn2_b64 exec, exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll index f0e7cba6924d8..3477cc664a294 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-add-zext-xor.ll @@ -16,9 +16,10 @@ define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1010-NEXT: s_branch .LBB0_2 ; GFX1010-NEXT: .LBB0_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX1010-NEXT: s_xor_b32 s5, s5, -1 -; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 +; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 +; GFX1010-NEXT: v_not_b32_e32 v0, v0 +; GFX1010-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1010-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX1010-NEXT: v_mov_b32_e32 v1, v2 ; GFX1010-NEXT: s_cbranch_vccz .LBB0_4 @@ -46,11 +47,14 @@ define i32 @combine_add_zext_xor(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB0_2 +; GFX1100-NEXT: .p2align 6 ; GFX1100-NEXT: .LBB0_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB0_2 Depth=1 -; GFX1100-NEXT: s_xor_b32 s1, s1, -1 -; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_not_b32_e32 v0, v0 +; GFX1100-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_add_nc_u32_e32 v2, v1, v0 ; GFX1100-NEXT: v_mov_b32_e32 v1, v2 @@ -107,9 +111,10 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1010-NEXT: s_branch .LBB1_2 ; GFX1010-NEXT: .LBB1_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX1010-NEXT: s_xor_b32 s5, s5, -1 -; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 +; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 +; GFX1010-NEXT: v_not_b32_e32 v0, v0 +; GFX1010-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1010-NEXT: v_sub_nc_u32_e32 v2, 
v1, v0 ; GFX1010-NEXT: v_mov_b32_e32 v1, v2 ; GFX1010-NEXT: s_cbranch_vccz .LBB1_4 @@ -137,11 +142,14 @@ define i32 @combine_sub_zext_xor(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB1_2 +; GFX1100-NEXT: .p2align 6 ; GFX1100-NEXT: .LBB1_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB1_2 Depth=1 -; GFX1100-NEXT: s_xor_b32 s1, s1, -1 -; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_not_b32_e32 v0, v0 +; GFX1100-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_sub_nc_u32_e32 v2, v1, v0 ; GFX1100-NEXT: v_mov_b32_e32 v1, v2 @@ -216,8 +224,10 @@ define i32 @combine_add_zext_or(i32 inreg %cond) { ; GFX1010-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 ; GFX1010-NEXT: s_branch .LBB2_1 ; GFX1010-NEXT: .LBB2_4: ; %.exit -; GFX1010-NEXT: s_or_b32 s4, s6, s7 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s7 +; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX1010-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1010-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1010-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: combine_add_zext_or: @@ -250,9 +260,11 @@ define i32 @combine_add_zext_or(i32 inreg %cond) { ; GFX1100-NEXT: v_cmp_eq_u32_e64 s2, 0, v0 ; GFX1100-NEXT: s_branch .LBB2_1 ; GFX1100-NEXT: .LBB2_4: ; %.exit -; GFX1100-NEXT: s_or_b32 s0, s2, s3 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1100-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1100-NEXT: s_setpc_b64 s[30:31] .entry: br label %.a @@ -312,8 +324,10 @@ define i32 @combine_sub_zext_or(i32 inreg %cond) { ; GFX1010-NEXT: v_cmp_eq_u32_e64 s6, 0, v0 ; GFX1010-NEXT: s_branch .LBB3_1 ; GFX1010-NEXT: .LBB3_4: ; %.exit -; GFX1010-NEXT: s_or_b32 s4, s6, s7 -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s7 +; GFX1010-NEXT: v_cndmask_b32_e64 v1, 0, 1, s6 +; GFX1010-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1010-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1010-NEXT: s_setpc_b64 s[30:31] ; ; GFX1100-LABEL: combine_sub_zext_or: @@ -346,9 +360,11 @@ define i32 @combine_sub_zext_or(i32 inreg %cond) { ; GFX1100-NEXT: v_cmp_eq_u32_e64 s2, 0, v0 ; GFX1100-NEXT: s_branch .LBB3_1 ; GFX1100-NEXT: .LBB3_4: ; %.exit -; GFX1100-NEXT: s_or_b32 s0, s2, s3 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1100-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1100-NEXT: s_setpc_b64 s[30:31] .entry: br label %.a @@ -391,8 +407,10 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1010-NEXT: .LBB4_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB4_2 Depth=1 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1010-NEXT: s_and_b32 s5, s5, vcc_lo -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 +; 
GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1010-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX1010-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1010-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX1010-NEXT: s_cbranch_vccz .LBB4_4 ; GFX1010-NEXT: .LBB4_2: ; %.a @@ -419,13 +437,16 @@ define i32 @combine_add_zext_and(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB4_2 +; GFX1100-NEXT: .p2align 6 ; GFX1100-NEXT: .LBB4_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB4_2 Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1100-NEXT: s_and_b32 s1, s1, vcc_lo -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1100-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1100-NEXT: v_add_nc_u32_e32 v1, v1, v0 ; GFX1100-NEXT: s_cbranch_vccz .LBB4_4 ; GFX1100-NEXT: .LBB4_2: ; %.a @@ -482,8 +503,10 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1010-NEXT: .LBB5_1: ; %bb9 ; GFX1010-NEXT: ; in Loop: Header=BB5_2 Depth=1 ; GFX1010-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1010-NEXT: s_and_b32 s5, s5, vcc_lo -; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, s5 +; GFX1010-NEXT: v_cndmask_b32_e64 v2, 0, 1, s5 +; GFX1010-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1010-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX1010-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1010-NEXT: v_sub_nc_u32_e32 v1, v1, v0 ; GFX1010-NEXT: s_cbranch_vccz .LBB5_4 ; GFX1010-NEXT: .LBB5_2: ; %.a @@ -510,13 +533,16 @@ define i32 @combine_sub_zext_and(i32 inreg %cond) { ; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 ; GFX1100-NEXT: v_cmp_ne_u32_e64 s0, 1, v0 ; GFX1100-NEXT: s_branch .LBB5_2 +; GFX1100-NEXT: .p2align 6 ; GFX1100-NEXT: .LBB5_1: ; %bb9 ; GFX1100-NEXT: ; in Loop: Header=BB5_2 Depth=1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) ; GFX1100-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0xfffffbe6, v1 -; GFX1100-NEXT: s_and_b32 s1, s1, vcc_lo -; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 -; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; GFX1100-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1100-NEXT: v_and_b32_e32 v0, v2, v0 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX1100-NEXT: v_sub_nc_u32_e32 v1, v1, v0 ; GFX1100-NEXT: s_cbranch_vccz .LBB5_4 ; GFX1100-NEXT: .LBB5_2: ; %.a diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll index 1778fa42fbf7e..bb2eda5d549ec 100644 --- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll +++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll @@ -70,8 +70,9 @@ define i16 @add1_i16(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture ; GFX9-NEXT: global_load_dword v0, v[0:1], off ; GFX9-NEXT: v_bfe_u32 v1, v31, 10, 10 ; GFX9-NEXT: v_cmp_gt_u32_e32 vcc, v2, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 
1, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_addc_co_u32_e32 v0, vcc, 0, v0, vcc +; GFX9-NEXT: v_add_u16_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: %x = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll index 1d20218440f6a..dbec39fd0839c 100644 --- a/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll +++ b/llvm/test/CodeGen/AMDGPU/combine_andor_with_cmps.ll @@ -1808,12 +1808,13 @@ define i1 @test93(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %C) { ; GCN-LABEL: test93: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_max_u32_e32 v2, v2, v3 ; GCN-NEXT: v_min_u32_e32 v0, v0, v1 -; GCN-NEXT: v_max_u32_e32 v1, v2, v3 +; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v2, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v4 -; GCN-NEXT: v_cmp_gt_u32_e64 s0, v1, v4 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, %C %cmp2 = icmp ult i32 %arg2, %C @@ -2019,9 +2020,10 @@ define i1 @test103(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a ; GCN-NEXT: v_max_u32_e32 v2, v2, v3 ; GCN-NEXT: v_maxmin_u32 v0, v0, v1, v4 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v2, v6 -; GCN-NEXT: v_cmp_lt_u32_e64 s0, v0, v6 -; GCN-NEXT: s_or_b32 s0, s0, vcc_lo -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v6 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, %C %cmp2 = icmp ult i32 %arg2, %C @@ -2044,17 +2046,19 @@ define i1 @test104(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_min_u32_e32 v8, v8, v9 ; GCN-NEXT: v_max_u32_e32 v2, v2, v3 -; GCN-NEXT: v_min_u32_e32 v3, v4, v5 -; GCN-NEXT: v_max_u32_e32 v4, v6, v7 +; GCN-NEXT: v_max_u32_e32 v3, v6, v7 ; GCN-NEXT: v_min3_u32 v0, v0, v1, v8 ; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v2, v10 -; GCN-NEXT: v_cmp_lt_u32_e64 s0, v3, v10 -; GCN-NEXT: v_cmp_gt_u32_e64 s1, v4, v10 -; GCN-NEXT: v_cmp_lt_u32_e64 s2, v0, v10 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_or_b32 s1, s2, vcc_lo -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_min_u32_e32 v1, v4, v5 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v10 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v10 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v10 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GCN-NEXT: v_or3_b32 v0, v1, v3, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, %C %cmp2 = icmp ult i32 %arg2, %C @@ -2082,18 +2086,21 @@ define i1 @test105(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a ; GCN-LABEL: test105: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_max_u32_e32 v2, v2, v3 ; GCN-NEXT: v_max_u32_e32 v0, v0, v1 -; GCN-NEXT: v_max_u32_e32 v1, v2, v3 -; GCN-NEXT: v_max_u32_e32 v2, v4, v5 +; GCN-NEXT: v_max_u32_e32 v1, v4, v5 ; GCN-NEXT: v_max_u32_e32 v3, v6, v7 +; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v2, 
v10 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v10 -; GCN-NEXT: v_cmp_gt_u32_e64 s0, v1, v10 -; GCN-NEXT: v_cmp_lt_u32_e64 s1, v2, v10 -; GCN-NEXT: v_cmp_gt_u32_e64 s2, v3, v10 -; GCN-NEXT: s_and_b32 s0, vcc_lo, s0 -; GCN-NEXT: s_or_b32 s1, s2, s1 -; GCN-NEXT: s_and_b32 s0, s0, s1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v10 +; GCN-NEXT: v_and_b32_e32 v0, v0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v3, v10 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v1, v3, v1 +; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, %C %cmp2 = icmp ult i32 %arg2, %C @@ -2117,20 +2124,22 @@ define i1 @test106(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4, i32 %arg5, i32 %a ; GCN-LABEL: test106: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_min_u32_e32 v6, v6, v7 -; GCN-NEXT: v_min_u32_e32 v0, v0, v1 -; GCN-NEXT: v_min_u32_e32 v1, v10, v11 ; GCN-NEXT: v_min_u32_e32 v2, v2, v3 -; GCN-NEXT: v_min3_u32 v3, v4, v5, v6 +; GCN-NEXT: v_min_u32_e32 v0, v0, v1 +; GCN-NEXT: v_min_u32_e32 v1, v6, v7 +; GCN-NEXT: v_min_u32_e32 v3, v10, v11 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v2, v13 +; GCN-NEXT: v_min3_u32 v1, v4, v5, v1 +; GCN-NEXT: v_min3_u32 v3, v8, v9, v3 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v12 -; GCN-NEXT: v_min3_u32 v0, v8, v9, v1 -; GCN-NEXT: v_cmp_lt_u32_e64 s0, v2, v13 -; GCN-NEXT: v_cmp_lt_u32_e64 s1, v3, v13 -; GCN-NEXT: v_cmp_lt_u32_e64 s2, v0, v12 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: s_or_b32 s0, s2, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v13 +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v3, v12 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GCN-NEXT: v_or3_b32 v0, v0, v1, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, %C1 %cmp2 = icmp ult i32 %arg2, %C1 @@ -2202,23 +2211,27 @@ define i1 @test109(float %arg1, float %arg2, float %arg3, float %arg4, float %C) ; GFX11-LABEL: test109: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 ; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v1, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test109: ; GFX11NONANS: ; %bb.0: ; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11NONANS-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3 +; GFX11NONANS-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11NONANS-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11NONANS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v4 +; 
GFX11NONANS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX11NONANS-NEXT: v_cmp_gt_f32_e64 s0, v1, v4 -; GFX11NONANS-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11NONANS-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C %cmp2 = fcmp olt float %arg2, %C @@ -2234,13 +2247,15 @@ define i1 @test110(float %arg1, float %arg2, float %arg3, float %arg4, float %C1 ; GCN-LABEL: test110: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 ; GCN-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v3, v3, v7 -; GCN-NEXT: v_dual_max_f32 v0, v0, v1 :: v_dual_min_f32 v1, v2, v3 +; GCN-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v1, v1, v5 +; GCN-NEXT: v_min_f32_e32 v2, v2, v3 +; GCN-NEXT: v_max_f32_e32 v0, v0, v1 +; GCN-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v8 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 -; GCN-NEXT: v_cmp_gt_f32_e64 s0, v1, v8 -; GCN-NEXT: s_and_b32 s0, vcc_lo, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_and_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %add1 = fadd nnan float %arg1, %C1 %add2 = fadd nnan float %arg2, %C2 @@ -2306,10 +2321,11 @@ define i1 @test112(float %arg1, float %arg2, float %arg3, float %arg4, float %ar ; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_min_f32 v2, v2, v3 ; GFX11-NEXT: v_max_f32_e32 v3, v6, v6 ; GFX11-NEXT: v_min3_f32 v0, v0, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_min3_f32 v0, v0, v5, v3 -; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v0, v8 -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test112: @@ -2347,9 +2363,10 @@ define i1 @test113(float %arg1, float %arg2, float %arg3, float %C) { ; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cmp_nge_f32_e64 s0, v0, v3 -; GFX11-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test113: @@ -2374,9 +2391,10 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) { ; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v2, v3 ; GFX11-NEXT: v_max_f32_e32 v0, v0, v1 -; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v0, v3 -; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test114: @@ -2384,9 +2402,10 @@ define i1 @test114(float %arg1, float %arg2, float %arg3, float %C) { ; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11NONANS-NEXT: v_max_f32_e32 v0, v0, v1 ; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v3 -; GFX11NONANS-NEXT: v_cmp_gt_f32_e64 s0, v0, v3 -; GFX11NONANS-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v0, v3 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11NONANS-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp ogt float %arg1, %C %cmp2 = fcmp ogt float %arg2, %C @@ -2400,13 +2419,15 @@ define i1 @test115(float %arg1, float %arg2, float %arg3, float %arg4, float %C) ; GFX11-LABEL: test115: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v1, v1, v1 -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v3, v3, v3 -; GFX11-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_max_f32 v1, v2, v3 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v2 :: v_dual_max_f32 v3, v3, v3 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11-NEXT: v_min_f32_e32 v0, v0, v1 +; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v2, v4 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v4 -; GFX11-NEXT: v_cmp_nge_f32_e64 s0, v1, v4 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test115: @@ -2434,39 +2455,45 @@ define i1 @test116(float %arg1, float %arg2, float %arg3, float %arg4, float %ar ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_dual_max_f32 v9, v9, v9 :: v_dual_max_f32 v8, v8, v8 -; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 ; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_dual_max_f32 v5, v5, v5 :: v_dual_max_f32 v4, v4, v4 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v6, v6, v6 -; GFX11-NEXT: v_min_f32_e32 v8, v8, v9 -; GFX11-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_min_f32 v3, v4, v5 -; GFX11-NEXT: v_max_f32_e32 v4, v6, v7 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_dual_min_f32 v8, v8, v9 :: v_dual_max_f32 v7, v7, v7 +; GFX11-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_max_f32 v5, v5, v5 +; GFX11-NEXT: v_max_f32_e32 v4, v4, v4 +; GFX11-NEXT: v_max_f32_e32 v6, v6, v6 ; GFX11-NEXT: v_min3_f32 v0, v0, v1, v8 ; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v10 -; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v3, v10 -; GFX11-NEXT: v_cmp_gt_f32_e64 s1, v4, v10 -; GFX11-NEXT: v_cmp_lt_f32_e64 s2, v0, v10 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s1, s2, vcc_lo -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_min_f32_e32 v1, v4, v5 +; GFX11-NEXT: v_max_f32_e32 v3, v6, v7 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v10 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v10 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v10 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test116: ; GFX11NONANS: ; %bb.0: ; 
GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11NONANS-NEXT: v_min_f32_e32 v8, v8, v9 -; GFX11NONANS-NEXT: v_dual_max_f32 v2, v2, v3 :: v_dual_min_f32 v3, v4, v5 -; GFX11NONANS-NEXT: v_max_f32_e32 v4, v6, v7 +; GFX11NONANS-NEXT: v_max_f32_e32 v2, v2, v3 +; GFX11NONANS-NEXT: v_max_f32_e32 v3, v6, v7 ; GFX11NONANS-NEXT: v_min3_f32 v0, v0, v1, v8 +; GFX11NONANS-NEXT: v_min_f32_e32 v1, v4, v5 ; GFX11NONANS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v10 -; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v3, v10 -; GFX11NONANS-NEXT: v_cmp_gt_f32_e64 s1, v4, v10 -; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s2, v0, v10 -; GFX11NONANS-NEXT: s_or_b32 s0, s0, s1 -; GFX11NONANS-NEXT: s_or_b32 s1, s2, vcc_lo -; GFX11NONANS-NEXT: s_or_b32 s0, s0, s1 -; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v10 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_cmp_gt_f32_e32 vcc_lo, v3, v10 +; GFX11NONANS-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v10 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11NONANS-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C %cmp2 = fcmp olt float %arg2, %C @@ -2494,41 +2521,45 @@ define i1 @test117(float %arg1, float %arg2, float %arg3, float %arg4, float %ar ; GFX11-LABEL: test117: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v6, v6, v6 -; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v10, v10, v10 +; GFX11-NEXT: v_dual_max_f32 v3, v3, v3 :: v_dual_max_f32 v2, v2, v2 ; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 -; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v2, v2, v2 -; GFX11-NEXT: v_min_f32_e32 v6, v6, v7 -; GFX11-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v10, v11 +; GFX11-NEXT: v_dual_max_f32 v7, v7, v7 :: v_dual_max_f32 v6, v6, v6 ; GFX11-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX11-NEXT: v_min3_f32 v3, v4, v5, v6 +; GFX11-NEXT: v_dual_max_f32 v11, v11, v11 :: v_dual_max_f32 v10, v10, v10 +; GFX11-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v6, v7 +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v13 +; GFX11-NEXT: v_min_f32_e32 v3, v10, v11 +; GFX11-NEXT: v_min3_f32 v1, v4, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v12 -; GFX11-NEXT: v_min3_f32 v0, v8, v9, v1 -; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v2, v13 -; GFX11-NEXT: v_cmp_lt_f32_e64 s1, v3, v13 -; GFX11-NEXT: v_cmp_lt_f32_e64 s2, v0, v12 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_or_b32 s0, s0, s1 -; GFX11-NEXT: s_or_b32 s0, s2, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_min3_f32 v3, v8, v9, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v13 +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v12 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test117: ; GFX11NONANS: ; %bb.0: ; GFX11NONANS-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11NONANS-NEXT: v_min_f32_e32 v6, v6, v7 -; GFX11NONANS-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v10, v11 ; 
GFX11NONANS-NEXT: v_min_f32_e32 v2, v2, v3 -; GFX11NONANS-NEXT: v_min3_f32 v3, v4, v5, v6 +; GFX11NONANS-NEXT: v_dual_min_f32 v0, v0, v1 :: v_dual_min_f32 v1, v6, v7 +; GFX11NONANS-NEXT: v_min_f32_e32 v3, v10, v11 +; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v2, v13 +; GFX11NONANS-NEXT: v_min3_f32 v1, v4, v5, v1 +; GFX11NONANS-NEXT: v_min3_f32 v3, v8, v9, v3 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v12 -; GFX11NONANS-NEXT: v_min3_f32 v0, v8, v9, v1 -; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s0, v2, v13 -; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s1, v3, v13 -; GFX11NONANS-NEXT: v_cmp_lt_f32_e64 s2, v0, v12 -; GFX11NONANS-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11NONANS-NEXT: s_or_b32 s0, s0, s1 -; GFX11NONANS-NEXT: s_or_b32 s0, s2, s0 -; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v13 +; GFX11NONANS-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_cmp_lt_f32_e32 vcc_lo, v3, v12 +; GFX11NONANS-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11NONANS-NEXT: v_or3_b32 v0, v0, v1, v2 ; GFX11NONANS-NEXT: s_setpc_b64 s[30:31] %cmp1 = fcmp olt float %arg1, %C1 %cmp2 = fcmp olt float %arg2, %C1 @@ -2695,10 +2726,11 @@ define i1 @test124(i32 %arg1, i64 %arg2) { ; GCN-LABEL: test124: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0x3e8, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0x3e8, v[1:2] -; GCN-NEXT: v_cmp_gt_i32_e64 s0, 0x3e8, v0 -; GCN-NEXT: s_or_b32 s0, s0, vcc_lo -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp slt i32 %arg1, 1000 %cmp2 = icmp slt i64 %arg2, 1000 @@ -2710,10 +2742,11 @@ define i1 @test125(i32 %arg1, i32 %arg2) { ; GCN-LABEL: test125: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x3e8, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x3e8, v0 -; GCN-NEXT: v_cmp_eq_u32_e64 s0, 0x3e8, v1 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp eq i32 %arg1, 1000 %cmp2 = icmp eq i32 %arg2, 1000 @@ -2725,10 +2758,11 @@ define i1 @test126(i32 %arg1, i32 %arg2) { ; GCN-LABEL: test126: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x3e8, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0x3e8, v0 -; GCN-NEXT: v_cmp_ne_u32_e64 s0, 0x3e8, v1 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ne i32 %arg1, 1000 %cmp2 = icmp ne i32 %arg2, 1000 @@ -2740,10 +2774,11 @@ define i1 @test127(i64 %arg1, i64 %arg2, i64 %arg3) { ; GCN-LABEL: test127: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[2:3], v[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_lt_u64_e32 vcc_lo, v[0:1], v[4:5] -; GCN-NEXT: v_cmp_lt_u64_e64 s0, v[2:3], v[4:5] -; GCN-NEXT: s_or_b32 s0, vcc_lo, 
s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i64 %arg1, %arg3 %cmp2 = icmp ult i64 %arg2, %arg3 @@ -2755,10 +2790,11 @@ define i1 @test128(i32 %arg1, i32 %arg2, i32 %arg3) { ; GCN-LABEL: test128: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v2, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2 -; GCN-NEXT: v_cmp_lt_u32_e64 s0, v2, v1 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, %arg3 %cmp2 = icmp ult i32 %arg3, %arg2 @@ -2770,10 +2806,11 @@ define i1 @test129(i32 %arg1, i32 %arg2, i32 %arg3) { ; GCN-LABEL: test129: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_le_u32_e32 vcc_lo, v1, v2 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2 -; GCN-NEXT: v_cmp_le_u32_e64 s0, v1, v2 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, %arg3 %cmp2 = icmp ule i32 %arg2, %arg3 @@ -2785,10 +2822,11 @@ define i1 @test130(i32 %arg1, i32 %arg2, i32 %arg3) { ; GCN-LABEL: test130: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, v1, v2 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_le_u32_e32 vcc_lo, v2, v0 -; GCN-NEXT: v_cmp_gt_u32_e64 s0, v1, v2 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ule i32 %arg3, %arg1 %cmp2 = icmp ugt i32 %arg2, %arg3 @@ -2801,36 +2839,40 @@ define i1 @test131(i16 %arg1, i32 %arg2) { ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-TRUE16-NEXT: v_cmp_gt_u32_e32 vcc_lo, 10, v1 -; GFX11-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 10, v0.l -; GFX11-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 10, v0.l +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-FAKE16-LABEL: test131: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_gt_u32_e32 vcc_lo, 10, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 10, v0 -; GFX11-FAKE16-NEXT: v_cmp_gt_u32_e64 s0, 10, v1 -; GFX11-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GCN-TRUE16-LABEL: test131: ; GCN-TRUE16: ; %bb.0: ; GCN-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-TRUE16-NEXT: v_cmp_gt_u32_e32 vcc_lo, 10, v1 -; GCN-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 10, v0.l -; GCN-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GCN-TRUE16-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GCN-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 10, v0.l +; GCN-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GCN-FAKE16-LABEL: test131: ; GCN-FAKE16: ; %bb.0: ; GCN-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-FAKE16-NEXT: v_cmp_gt_u32_e32 vcc_lo, 10, v1 +; GCN-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 10, v0 -; GCN-FAKE16-NEXT: v_cmp_gt_u32_e64 s0, 10, v1 -; GCN-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-FAKE16-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i16 %arg1, 10 %cmp2 = icmp ult i32 %arg2, 10 @@ -2843,12 +2885,13 @@ define i1 @test132(i32 %arg1, i32 %arg2, i32 %arg3, i32 %arg4) { ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v2 -; GCN-NEXT: v_cmp_lt_u32_e64 s0, v1, v2 -; GCN-NEXT: v_cmp_lt_u32_e64 s1, v0, v3 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: s_or_b32 s1, s1, vcc_lo -; GCN-NEXT: s_or_b32 s0, s0, s1 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v0, v3 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_cmp_lt_u32_e32 vcc_lo, v1, v2 +; GCN-NEXT: v_or_b32_e32 v0, v0, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GCN-NEXT: v_or3_b32 v0, v4, v1, v0 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, %arg3 %cmp2 = icmp ult i32 %arg2, %arg3 @@ -2863,10 +2906,11 @@ define i1 @test133(i32 %arg1, i32 %arg2) { ; GCN-LABEL: test133: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x3e8, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GCN-NEXT: v_cmp_gt_u32_e32 vcc_lo, 0x64, v0 -; GCN-NEXT: v_cmp_gt_u32_e64 s0, 0x3e8, v1 -; GCN-NEXT: s_or_b32 s0, vcc_lo, s0 -; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 ; GCN-NEXT: s_setpc_b64 s[30:31] %cmp1 = icmp ult i32 %arg1, 100 %cmp2 = icmp ult i32 %arg2, 1000 @@ -2878,10 +2922,11 @@ define i1 @test134(float %arg1, float %arg2, float %arg3) #0 { ; GFX11-LABEL: test134: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_gt_f32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_gt_f32_e64 s0, v2, v1 -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test134: @@ -2901,10 +2946,11 @@ define i1 @test135(float %arg1, float %arg2, float %arg3) #0 { ; GFX11-LABEL: test135: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_nle_f32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_nge_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_nle_f32_e64 s0, v2, v1 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX11NONANS-LABEL: test135: @@ -2924,12 +2970,13 @@ define i1 @test136(double %arg1, double %arg2, double %arg3) { ; GFX11-LABEL: test136: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_cmp_ge_f64_e32 vcc_lo, v[4:5], v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_ge_f64_e64 s0, v[4:5], v[2:3] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test136: @@ -2953,11 +3000,12 @@ define i1 @test137(float %arg1, float %arg2, float %arg3) { ; GFX11-LABEL: test137: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v2, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v2, v1 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test137: @@ -2979,10 +3027,11 @@ define i1 @test138(float %arg1, float %arg2, float %arg3) #0 { ; GFX11-LABEL: test138: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v1, v2 -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test138: @@ -3002,10 +3051,11 @@ define i1 @test139(double %arg1, double %arg2, double %arg3) #0 { ; GFX11-LABEL: test139: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_le_f64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_le_f64_e64 s0, v[2:3], v[4:5] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test139: @@ -3025,10 +3075,11 @@ define i1 @test140(double %arg1, double %arg2, double %arg3) #0 { ; GFX11-LABEL: test140: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[4:5] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test140: @@ -3048,10 +3099,11 @@ define i1 @test141(float %arg1, float %arg2, float %arg3) #0 { ; GFX11-LABEL: 
test141: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_ge_f32_e64 s0, v1, v2 -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test141: @@ -3071,10 +3123,11 @@ define i1 @test142(double %arg1, double %arg2, double %arg3) #0 { ; GFX11-LABEL: test142: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_nle_f64_e64 s0, v[2:3], v[4:5] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test142: @@ -3094,10 +3147,11 @@ define i1 @test143(float %arg1, float %arg2, float %arg3) #0 { ; GFX11-LABEL: test143: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test143: @@ -3117,10 +3171,11 @@ define i1 @test144(float %arg1, float %arg2, float %arg3) #0 { ; GFX11-LABEL: test144: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_ngt_f32_e64 s0, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test144: @@ -3140,10 +3195,11 @@ define i1 @test145(double %arg1, double %arg2, double %arg3) #0 { ; GFX11-LABEL: test145: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_nge_f64_e64 s0, v[2:3], v[4:5] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test145: @@ -3163,11 +3219,12 @@ define i1 @test146(float %arg1, float %arg2, float %arg3) { ; GFX11-LABEL: test146: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_lt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_lt_f32_e64 s0, v1, v2 -; GFX11-NEXT: s_and_b32 s0, 
vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test146: @@ -3189,12 +3246,13 @@ define i1 @test147(double %arg1, double %arg2, double %arg3) { ; GFX11-LABEL: test147: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_cmp_le_f64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_le_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_le_f64_e64 s0, v[2:3], v[4:5] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test147: @@ -3218,12 +3276,13 @@ define i1 @test148(double %arg1, double %arg2, double %arg3) { ; GFX11-LABEL: test148: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_gt_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_gt_f64_e64 s0, v[2:3], v[4:5] -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test148: @@ -3247,11 +3306,12 @@ define i1 @test149(float %arg1, float %arg2, float %arg3) { ; GFX11-LABEL: test149: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ge_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_ge_f32_e64 s0, v1, v2 -; GFX11-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test149: @@ -3273,12 +3333,13 @@ define i1 @test150(double %arg1, double %arg2, double %arg3) { ; GFX11-LABEL: test150: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_nle_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_nle_f64_e64 s0, v[2:3], v[4:5] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test150: @@ -3302,11 +3363,12 @@ define i1 @test151(float %arg1, float %arg2, float %arg3) { ; GFX11-LABEL: test151: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; 
GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_nlt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_nlt_f32_e64 s0, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test151: @@ -3328,11 +3390,12 @@ define i1 @test152(float %arg1, float %arg2, float %arg3) { ; GFX11-LABEL: test152: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_dual_max_f32 v0, v0, v0 :: v_dual_max_f32 v1, v1, v1 +; GFX11-NEXT: v_dual_max_f32 v1, v1, v1 :: v_dual_max_f32 v0, v0, v0 +; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v1, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ngt_f32_e32 vcc_lo, v0, v2 -; GFX11-NEXT: v_cmp_ngt_f32_e64 s0, v1, v2 -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test152: @@ -3354,12 +3417,13 @@ define i1 @test153(double %arg1, double %arg2, double %arg3) { ; GFX11-LABEL: test153: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] ; GFX11-NEXT: v_max_f64 v[2:3], v[2:3], v[2:3] +; GFX11-NEXT: v_max_f64 v[0:1], v[0:1], v[0:1] +; GFX11-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[2:3], v[4:5] +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_nge_f64_e32 vcc_lo, v[0:1], v[4:5] -; GFX11-NEXT: v_cmp_nge_f64_e64 s0, v[2:3], v[4:5] -; GFX11-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX11NONANS-LABEL: test153: diff --git a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll index 11795cca18daa..a906563c76a30 100644 --- a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll +++ b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll @@ -5,9 +5,9 @@ define i16 @num_sign_bits_mul_i48_0(i8 %X, i8 %Y, i8 %Z, i8 %W) { ; GFX9-LABEL: num_sign_bits_mul_i48_0: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_mul_i32_i24_sdwa v1, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 -; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v1 +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_mul_lo_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_mul_lo_u16_e32 v0, v0, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %A = sext i8 %X to i48 %B = sext i8 %Y to i48 @@ -24,8 +24,8 @@ define i16 @num_sign_bits_mul_i48_1(i8 %X, i8 %Y, i8 %Z, i8 %W) { ; GFX9-LABEL: num_sign_bits_mul_i48_1: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mul_i32_i24_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD 
dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 +; GFX9-NEXT: v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0 ; GFX9-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v2 ; GFX9-NEXT: v_mul_i32_i24_e32 v0, v0, v2 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll index ed0a97c729c9c..1e7026abb7212 100644 --- a/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll +++ b/llvm/test/CodeGen/AMDGPU/copy_to_scc.ll @@ -15,13 +15,15 @@ define amdgpu_kernel void @copy_to_scc(ptr addrspace(1) %out, ptr addrspace(1) % ; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s2, 0 ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_xor_b64 s[2:3], s[2:3], vcc -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s2, 2, 3 -; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; GCN-NEXT: v_xor_b32_e32 v0, v2, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 3, 2, vcc ; GCN-NEXT: global_store_dword v1, v0, s[0:1] ; GCN-NEXT: s_endpgm entry: ; preds = %1009 diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll index 06c0417211809..63828482006d1 100644 --- a/llvm/test/CodeGen/AMDGPU/div_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll @@ -27,14 +27,15 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_ashrrev_i32_e32 v17, 31, v7 ; GFX9-NEXT: v_cndmask_b32_e32 v20, v5, v1, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v21, v4, v0, vcc +; GFX9-NEXT: v_or_b32_e32 v5, v9, v11 +; GFX9-NEXT: v_or_b32_e32 v4, v8, v10 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v7, v3, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v6, v2, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: v_or_b32_e32 v3, v20, v1 ; GFX9-NEXT: v_or_b32_e32 v2, v21, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_or_b32_e32 v3, v9, v11 -; GFX9-NEXT: v_or_b32_e32 v2, v8, v10 -; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] ; GFX9-NEXT: v_ffbh_u32_e32 v2, v0 ; GFX9-NEXT: v_add_u32_e32 v2, 32, v2 ; GFX9-NEXT: v_ffbh_u32_e32 v3, v1 @@ -43,52 +44,55 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_add_u32_e32 v3, 32, v3 ; GFX9-NEXT: v_ffbh_u32_e32 v4, v20 ; GFX9-NEXT: v_min_u32_e32 v3, v3, v4 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 64, v3 -; GFX9-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; GFX9-NEXT: v_ffbh_u32_e32 v6, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v5, v11 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_ffbh_u32_e32 v3, v10 ; GFX9-NEXT: v_add_u32_e32 v3, 32, v3 -; GFX9-NEXT: v_min_u32_e32 v3, v3, v6 -; GFX9-NEXT: v_ffbh_u32_e32 v6, v8 -; GFX9-NEXT: v_add_u32_e32 v6, 32, v6 -; GFX9-NEXT: v_ffbh_u32_e32 v7, v9 -; GFX9-NEXT: v_min_u32_e32 v6, v6, v7 +; GFX9-NEXT: v_min_u32_e32 v3, v3, v5 +; GFX9-NEXT: v_ffbh_u32_e32 v5, v8 +; GFX9-NEXT: v_add_u32_e32 v5, 32, v5 +; GFX9-NEXT: v_ffbh_u32_e32 v13, v9 +; GFX9-NEXT: v_min_u32_e32 v5, v5, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, 0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, 64, v6 -; 
GFX9-NEXT: v_addc_co_u32_e64 v7, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v5, vcc, 64, v5 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, 0, vcc +; GFX9-NEXT: v_mov_b32_e32 v12, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v3 -; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v7, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, 0, v5, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v5, vcc -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v13, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v4, vcc, 0, v12, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, 0, v12, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0x7f +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX9-NEXT: v_mov_b32_e32 v18, v16 -; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GFX9-NEXT: v_mov_b32_e32 v19, v17 -; GFX9-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc -; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; GFX9-NEXT: v_or3_b32 v6, v7, v6, v12 +; GFX9-NEXT: v_xor_b32_e32 v14, 1, v6 +; GFX9-NEXT: v_and_b32_e32 v12, 1, v6 ; GFX9-NEXT: v_xor_b32_e32 v6, 0x7f, v2 -; GFX9-NEXT: v_or_b32_e32 v7, v3, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v6, v4 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v13, v11, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v10, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc +; GFX9-NEXT: v_or_b32_e32 v7, v3, v5 +; GFX9-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[6:7] +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v6, v14, v6 +; GFX9-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v13, v11, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, v10, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, 0, vcc +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, 0, vcc ; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB0_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 @@ -224,8 +228,8 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 @@ -235,12 +239,12 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v0 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def 
$vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -251,285 +255,289 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v3 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v19, v10 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v8 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, s10, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s4 +; GFX9-O0-NEXT: v_writelane_b32 v30, s8, 2 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_writelane_b32 v30, s9, 3 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, s8, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v3, vcc, v0, v2, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v13, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v0, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v0, v14, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s9 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v0, vcc, v0, v19, vcc -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v6 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[9:10], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[6:7], v[7:8], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v1, v2, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, 
v19, v0, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v19, v0, s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v13, v0, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v20 -; GFX9-O0-NEXT: v_mov_b32_e32 v14, v21 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v17, vcc, s10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v7, v8, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v7, v12, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v7, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v7, v14, vcc -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v14, v0, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v20 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v21 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v17, vcc, s8, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v9, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v9, v7, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v4, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v18 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[20:21], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v8, v9, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v17 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v14, v7, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v12, v7, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_xor_b32_e64 v14, v14, v19 -; GFX9-O0-NEXT: v_xor_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v18 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[6:7], v[20:21], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v17 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; 
GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: s_mov_b32 s4, 63 -; GFX9-O0-NEXT: v_ashrrev_i64 v[12:13], s4, v[12:13] -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v4, v9, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, v19 +; GFX9-O0-NEXT: v_xor_b32_e64 v19, v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr19 killed $vgpr19 def $vgpr19_vgpr20 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v4 +; GFX9-O0-NEXT: s_mov_b32 s6, 63 +; GFX9-O0-NEXT: v_ashrrev_i64 v[19:20], s6, v[19:20] +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v12 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: 
buffer_store_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[17:18], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[14:15], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[10:11] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[12:13], s[6:7] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 +; GFX9-O0-NEXT: s_mov_b32 s11, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v9, v9, s11 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 +; GFX9-O0-NEXT: v_min_u32_e64 v9, v9, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[12:13], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], s[6:7] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: s_mov_b32 s13, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 -; GFX9-O0-NEXT: s_mov_b32 s12, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_add_u32_e64 v8, v8, s11 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v11 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v8, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-O0-NEXT: 
s_mov_b64 s[12:13], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: s_mov_b32 s14, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s16, s13 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[14:15], v11, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v8, s[14:15], v8, v12, s[14:15] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 -; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s11 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v5, v6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s11 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v12, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v13, v5, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: s_mov_b32 s10, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: s_mov_b32 s12, s13 +; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[10:11], v12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[10:11], v5, v13, s[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v6, v8, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 -; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 -; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: s_mov_b32 s18, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s18 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 
-; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr16 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-O0-NEXT: s_mov_b32 s12, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: s_mov_b32 s14, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[12:13], v10, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v11, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword 
v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 -; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[8:9], s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[5:6], s[10:11] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[8:9], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7] +; GFX9-O0-NEXT: v_or3_b32 v7, v4, v7, v10 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v7, s6 +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, 1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s12, s11 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s12 +; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 killed $sgpr10_sgpr11 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v3, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, 1 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -539,17 +547,17 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 @@ -581,9 +589,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; 
GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 @@ -641,9 +649,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 @@ -672,9 +680,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 @@ -866,9 +874,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill @@ -897,9 +905,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload @@ -999,9 +1007,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill @@ -1028,9 +1036,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 
; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -1157,9 +1165,9 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 @@ -2307,65 +2315,69 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-LABEL: v_udiv_i128_vv: ; GFX9: ; %bb.0: ; %_udiv-special-cases ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v11, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v10, v0, v2 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_or_b32_e32 v9, v5, v7 ; GFX9-NEXT: v_or_b32_e32 v8, v4, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_or_b32_e32 v9, v1, v3 -; GFX9-NEXT: v_or_b32_e32 v8, v0, v2 -; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] -; GFX9-NEXT: v_ffbh_u32_e32 v8, v6 -; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 -; GFX9-NEXT: v_ffbh_u32_e32 v9, v7 -; GFX9-NEXT: v_min_u32_e32 v8, v8, v9 -; GFX9-NEXT: v_ffbh_u32_e32 v9, v4 -; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 -; GFX9-NEXT: v_ffbh_u32_e32 v10, v5 -; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_ffbh_u32_e32 v11, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc -; GFX9-NEXT: v_ffbh_u32_e32 v9, v2 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v6 ; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v11, v7 ; GFX9-NEXT: v_min_u32_e32 v9, v9, v11 -; GFX9-NEXT: v_ffbh_u32_e32 v11, v0 +; GFX9-NEXT: v_ffbh_u32_e32 v11, v4 ; GFX9-NEXT: v_add_u32_e32 v11, 32, v11 -; GFX9-NEXT: v_ffbh_u32_e32 v12, v1 +; GFX9-NEXT: v_ffbh_u32_e32 v12, v5 ; GFX9-NEXT: v_min_u32_e32 v11, v11, v12 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 64, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: v_addc_co_u32_e64 v12, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] +; GFX9-NEXT: v_ffbh_u32_e32 v14, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_ffbh_u32_e32 v11, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v13, v12, 0, vcc -; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v8, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v10, v13, vcc -; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v8, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, 0, v8, vcc -; 
GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[12:13] -; GFX9-NEXT: v_or_b32_e32 v10, v13, v15 -; GFX9-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_add_u32_e32 v11, 32, v11 +; GFX9-NEXT: v_ffbh_u32_e32 v12, v3 +; GFX9-NEXT: v_min_u32_e32 v11, v11, v12 +; GFX9-NEXT: v_ffbh_u32_e32 v12, v0 +; GFX9-NEXT: v_add_u32_e32 v12, 32, v12 +; GFX9-NEXT: v_min_u32_e32 v12, v12, v14 +; GFX9-NEXT: v_add_co_u32_e32 v12, vcc, 64, v12 +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[4:5], 0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: s_mov_b64 s[4:5], 0x7f +; GFX9-NEXT: v_cndmask_b32_e32 v11, v12, v11, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc +; GFX9-NEXT: v_sub_co_u32_e32 v12, vcc, v9, v11 +; GFX9-NEXT: v_subb_co_u32_e32 v13, vcc, v13, v14, vcc +; GFX9-NEXT: v_mov_b32_e32 v9, 0 +; GFX9-NEXT: v_subb_co_u32_e32 v14, vcc, 0, v9, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v15, vcc, 0, v9, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[12:13] +; GFX9-NEXT: v_or_b32_e32 v17, v13, v15 ; GFX9-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc +; GFX9-NEXT: v_or3_b32 v8, v8, v10, v9 +; GFX9-NEXT: v_xor_b32_e32 v18, 1, v8 ; GFX9-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX9-NEXT: v_xor_b32_e32 v9, 0x7f, v12 +; GFX9-NEXT: v_or_b32_e32 v16, v9, v14 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 -; GFX9-NEXT: v_or_b32_e32 v9, v9, v14 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] -; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v8, v3, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v9, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v10, v1, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v11, v0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX9-NEXT: v_and_b32_e32 v16, v18, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12 @@ -2492,33 +2504,33 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; 
GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v17 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -2528,189 +2540,193 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed 
$vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: s_mov_b32 s9, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[0:1], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[10:11], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: s_mov_b32 s7, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 +; GFX9-O0-NEXT: v_min_u32_e64 v11, v6, v9 +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v9, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: s_mov_b32 s10, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: s_mov_b32 s12, s9 +; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[10:11], v8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[10:11], v5, v8, s[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[10:11], v[16:17], s[10:11] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v6, v8, s[10:11] ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 -; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 -; GFX9-O0-NEXT: s_mov_b32 s14, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, 
s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[10:11], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 -; GFX9-O0-NEXT: s_mov_b32 s8, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 -; GFX9-O0-NEXT: s_mov_b32 s10, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[8:9], v10, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[8:9], v4, v11, s[8:9] ; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[12:13], s[8:9] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v12, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-O0-NEXT: s_mov_b32 s6, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: s_mov_b32 s8, s9 +; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[6:7], v12, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v13, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[14:15], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, 
v5, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v6, v8, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: s_mov_b32 s8, s4 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] -; 
GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 -; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[8:9], s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[5:6], s[10:11] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[8:9], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7] +; GFX9-O0-NEXT: v_or3_b32 v7, v4, v7, v10 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v7, s6 +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, 1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s12, s11 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s12 +; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 killed $sgpr10_sgpr11 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; 
GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v3, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, 1 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -2720,17 +2736,17 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-O0-NEXT: s_branch .LBB1_8 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 @@ -2762,9 +2778,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 @@ -2822,9 +2838,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 @@ -2853,9 +2869,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; 
GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 @@ -3047,9 +3063,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -3078,9 +3094,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload @@ -3180,9 +3196,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill @@ -3209,9 +3225,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -3338,9 +3354,9 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz 
.LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll index 77b78f1f8a333..2107351c81b9d 100644 --- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll +++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll @@ -10,7 +10,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v18, 0 ; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3 ; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v26, v24 ; SDAG-NEXT: v_mov_b32_e32 v27, v25 @@ -22,71 +22,76 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, v19, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v20 ; SDAG-NEXT: v_ffbh_u32_e32 v2, v21 +; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0, v8 ; SDAG-NEXT: v_cndmask_b32_e64 v17, v3, v0, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v1 +; SDAG-NEXT: v_ffbh_u32_e32 v3, v16 ; SDAG-NEXT: v_or_b32_e32 v0, v20, v16 -; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0, v8 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v16 +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, 0, v9, vcc +; SDAG-NEXT: v_min_u32_e32 v2, v1, v2 +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 32, v3 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v17 ; SDAG-NEXT: v_or_b32_e32 v1, v21, v17 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc -; SDAG-NEXT: v_min_u32_e32 v2, v19, v2 -; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v17 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; SDAG-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v23, s[6:7] -; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v3, s[6:7] -; SDAG-NEXT: v_min_u32_e32 v1, v19, v22 -; SDAG-NEXT: v_add_i32_e64 v2, s[8:9], 64, v2 -; SDAG-NEXT: v_addc_u32_e64 v3, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, 0, v11, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v0, s[6:7] +; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v28, v9, v22, s[4:5] +; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v29, v8, v19, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v3, v3, v23 +; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2 +; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] +; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v0, v10, v9, s[4:5] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v10, v2, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v9, v2, v3, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v3, v29 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v28 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v8, s[6:7] +; SDAG-NEXT: v_ffbh_u32_e32 v10, v28 +; SDAG-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v2, v29, v0 -; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v3 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v0 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v3 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 ; SDAG-NEXT: v_or_b32_e32 v3, v28, v1 -; SDAG-NEXT: v_min_u32_e32 v8, v8, v19 -; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11 -; SDAG-NEXT: v_ffbh_u32_e32 v19, v1 +; SDAG-NEXT: v_min_u32_e32 v10, v11, v10 +; SDAG-NEXT: 
v_add_i32_e32 v11, vcc, 32, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v1 +; SDAG-NEXT: v_min_u32_e32 v11, v11, v22 +; SDAG-NEXT: v_add_i32_e32 v10, vcc, 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v22, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v2, v11, v19 -; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 64, v8 -; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, s[6:7] -; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v3, v2, s[6:7] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v10 -; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v8, v9, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v22, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v10, v11, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v2, v19 +; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v10, v9 +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v8, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v2 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v18, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v18, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v8, v10 ; SDAG-NEXT: v_or_b32_e32 v9, v3, v11 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v22, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v18, v19, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v19, v18 +; SDAG-NEXT: v_xor_b32_e32 v18, 1, v9 +; SDAG-NEXT: v_and_b32_e32 v9, 1, v9 +; SDAG-NEXT: v_and_b32_e32 v8, v18, v8 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, vcc ; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v17, 0, s[4:5] -; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v22, v16, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v21, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, s[4:5] -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v23, v20, 0, vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB0_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v2 @@ -207,7 +212,7 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ashrrev_i32_e32 v17, 31, v15 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 0, v4 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v20, v16 ; SDAG-NEXT: v_mov_b32_e32 v21, v17 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v5, vcc @@ -216,102 +221,107 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e64 v3, v5, v1, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, v4, v0, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v0, vcc, 0, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[4:5] 
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v6, v9, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v1, v2 -; SDAG-NEXT: v_ffbh_u32_e32 v4, v3 -; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v0, s[4:5] -; SDAG-NEXT: v_sub_i32_e32 v5, vcc, 0, v12 -; SDAG-NEXT: v_or_b32_e32 v0, v2, v6 -; SDAG-NEXT: v_ffbh_u32_e32 v9, v6 +; SDAG-NEXT: v_ffbh_u32_e32 v6, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v5, v7, v0, s[4:5] +; SDAG-NEXT: v_sub_i32_e32 v7, vcc, 0, v12 +; SDAG-NEXT: v_or_b32_e32 v0, v2, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v9, v4 ; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v1 ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v13, vcc -; SDAG-NEXT: v_or_b32_e32 v1, v3, v7 +; SDAG-NEXT: v_or_b32_e32 v1, v3, v5 ; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v30, v7 -; SDAG-NEXT: v_min_u32_e32 v4, v10, v4 +; SDAG-NEXT: v_ffbh_u32_e32 v30, v5 +; SDAG-NEXT: v_min_u32_e32 v6, v10, v6 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v14, vcc ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[14:15] ; SDAG-NEXT: v_cndmask_b32_e64 v28, v13, v11, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v29, v12, v7, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[0:1] +; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[6:7] ; SDAG-NEXT: v_min_u32_e32 v1, v9, v30 -; SDAG-NEXT: v_add_i32_e64 v4, s[8:9], 64, v4 -; SDAG-NEXT: v_addc_u32_e64 v5, s[8:9], 0, 0, s[8:9] +; SDAG-NEXT: v_add_i32_e64 v6, s[6:7], 64, v6 +; SDAG-NEXT: v_addc_u32_e64 v7, s[6:7], 0, 0, s[6:7] ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v15, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v0, v14, v10, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v10, v29 -; SDAG-NEXT: v_ffbh_u32_e32 v11, v28 -; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v5, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v13, v4, v1, vcc +; SDAG-NEXT: v_ffbh_u32_e32 v12, v28 +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v14, v6, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v1, v15, v9, s[4:5] -; SDAG-NEXT: v_or_b32_e32 v4, v29, v0 +; SDAG-NEXT: v_or_b32_e32 v6, v29, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v9, v0 ; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v10 -; SDAG-NEXT: v_or_b32_e32 v5, v28, v1 +; SDAG-NEXT: v_or_b32_e32 v7, v28, v1 ; SDAG-NEXT: v_add_i32_e32 v9, vcc, 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v14, v1 -; SDAG-NEXT: v_min_u32_e32 v10, v10, v11 -; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_min_u32_e32 v4, v9, v14 -; SDAG-NEXT: v_add_i32_e64 v5, s[4:5], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SDAG-NEXT: v_ffbh_u32_e32 v15, v1 +; SDAG-NEXT: v_min_u32_e32 v10, v10, v12 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SDAG-NEXT: v_min_u32_e32 v7, v9, v15 +; SDAG-NEXT: v_add_i32_e32 v9, vcc, 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_or_b32_e32 v12, v6, v11 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v13 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v12, vcc -; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v6, v9, v7, vcc +; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v14 +; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v13, vcc +; SDAG-NEXT: v_xor_b32_e32 v9, 0x7f, v6 ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v8, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], 
s[10:11], v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v11, vcc, 0, v8, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v9, v10 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc -; SDAG-NEXT: v_or_b32_e32 v9, v5, v11 +; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v7, v11 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v13, v14, v13, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_and_b32_e32 v8, 1, v12 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v7, 0, s[4:5] -; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v12, v13 +; SDAG-NEXT: v_xor_b32_e32 v12, 1, v9 +; SDAG-NEXT: v_and_b32_e32 v9, 1, v9 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v14, v12, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v12, v4, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v14, 1, v14 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_cbranch_execz .LBB0_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 -; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v4 -; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v4 +; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6 +; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6 ; SDAG-NEXT: v_mov_b32_e32 v8, 0 ; SDAG-NEXT: v_mov_b32_e32 v9, 0 -; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v5, vcc +; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc ; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12 ; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v10, vcc ; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc ; SDAG-NEXT: v_or_b32_e32 v10, v30, v32 -; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v4 +; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v6 ; SDAG-NEXT: v_or_b32_e32 v11, v31, v33 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[6:7], v34 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[4:5], v34 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34 ; SDAG-NEXT: v_lshl_b64 v[14:15], v[2:3], v34 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] ; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v35 -; SDAG-NEXT: v_or_b32_e32 v5, v5, v11 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v10 +; SDAG-NEXT: v_or_b32_e32 v7, v7, v11 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v10 ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v13, v5, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, v12, v4, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v13, v7, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v15, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v14, s[4:5] ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34 -; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v7, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v6, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v7, v7, v5, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v6, v6, v4, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; 
SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -321,24 +331,24 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_lshr_b64 v[8:9], v[2:3], v30 ; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30 ; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30 -; SDAG-NEXT: v_lshr_b64 v[37:38], v[6:7], v30 +; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v29 ; SDAG-NEXT: v_mov_b32_e32 v12, 0 ; SDAG-NEXT: v_mov_b32_e32 v13, 0 ; SDAG-NEXT: v_mov_b32_e32 v14, 0 ; SDAG-NEXT: v_mov_b32_e32 v15, 0 ; SDAG-NEXT: s_mov_b64 s[10:11], 0 -; SDAG-NEXT: v_lshl_b64 v[48:49], v[6:7], v35 -; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v36 +; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35 +; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36 ; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v9, v9, v49 ; SDAG-NEXT: v_or_b32_e32 v8, v8, v48 ; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc ; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v7, v9, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v6, v8, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v38, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v37, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v5, v9, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, v4, v8, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5] ; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc ; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30 ; SDAG-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc @@ -346,23 +356,23 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_mov_b32_e32 v9, 0 ; SDAG-NEXT: .LBB0_9: ; %udiv-do-while ; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1 -; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 +; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v3 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1 -; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v5 -; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1 +; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v7 +; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1 -; SDAG-NEXT: v_or_b32_e32 v6, v6, v8 +; SDAG-NEXT: v_or_b32_e32 v4, v4, v8 ; SDAG-NEXT: v_or_b32_e32 v2, v2, v38 -; SDAG-NEXT: v_or_b32_e32 v4, v4, v39 -; SDAG-NEXT: v_or_b32_e32 v5, v13, v5 +; SDAG-NEXT: v_or_b32_e32 v6, v6, v39 +; SDAG-NEXT: v_or_b32_e32 v7, v13, v7 ; SDAG-NEXT: v_or_b32_e32 v11, v15, v11 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v34, v2 -; SDAG-NEXT: v_or_b32_e32 v4, v12, v4 +; SDAG-NEXT: v_or_b32_e32 v6, v12, v6 ; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v35, v3, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v6, vcc -; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v36, v4, vcc +; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v37, v5, vcc ; SDAG-NEXT: v_ashrrev_i32_e32 v8, 31, v8 ; SDAG-NEXT: v_and_b32_e32 v15, v8, v29 ; SDAG-NEXT: v_and_b32_e32 v38, v8, v28 @@ -370,8 +380,8 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_and_b32_e32 v48, v8, v1 ; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15 ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v38, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v39, vcc -; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v48, vcc +; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v48, vcc ; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30 ; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc ; SDAG-NEXT: v_addc_u32_e32 v32, 
vcc, -1, v32, vcc @@ -390,13 +400,13 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_or_b64 exec, exec, s[10:11] ; SDAG-NEXT: .LBB0_11: ; %Flow11 ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] -; SDAG-NEXT: v_lshl_b64 v[0:1], v[4:5], 1 +; SDAG-NEXT: v_lshl_b64 v[0:1], v[6:7], 1 ; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v11 ; SDAG-NEXT: v_lshl_b64 v[2:3], v[10:11], 1 ; SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; SDAG-NEXT: v_or_b32_e32 v13, v13, v1 -; SDAG-NEXT: v_or_b32_e32 v14, v9, v3 -; SDAG-NEXT: v_or_b32_e32 v9, v12, v0 +; SDAG-NEXT: v_or_b32_e32 v9, v9, v3 +; SDAG-NEXT: v_or_b32_e32 v12, v12, v0 ; SDAG-NEXT: v_or_b32_e32 v8, v8, v2 ; SDAG-NEXT: .LBB0_12: ; %Flow12 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] @@ -409,16 +419,16 @@ define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_xor_b32_e32 v1, v19, v3 ; SDAG-NEXT: v_xor_b32_e32 v0, v23, v2 ; SDAG-NEXT: v_xor_b32_e32 v10, v13, v7 -; SDAG-NEXT: v_xor_b32_e32 v9, v9, v6 -; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7 +; SDAG-NEXT: v_xor_b32_e32 v11, v12, v6 +; SDAG-NEXT: v_xor_b32_e32 v9, v9, v7 ; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2 ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc ; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc ; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc ; SDAG-NEXT: v_xor_b32_e32 v4, v8, v6 ; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc -; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc +; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v9, v7, vcc +; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v11, v6, vcc ; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v10, v7, vcc ; SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -836,8 +846,6 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 ; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 -; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 -; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v10 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v11 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v8 @@ -847,52 +855,59 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 +; SDAG-NEXT: s_mov_b64 s[4:5], 0x7f +; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 +; SDAG-NEXT: v_add_i32_e32 v22, vcc, 32, v22 +; SDAG-NEXT: v_add_i32_e32 v24, vcc, 32, v24 +; SDAG-NEXT: v_add_i32_e32 v26, vcc, 32, v26 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 -; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 -; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 -; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 -; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 -; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 -; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 -; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 -; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc -; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 -; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; SDAG-NEXT: v_min_u32_e32 v17, v20, v21 +; SDAG-NEXT: v_min_u32_e32 v19, v22, v23 +; 
SDAG-NEXT: v_min_u32_e32 v20, v24, v25 +; SDAG-NEXT: v_min_u32_e32 v21, v26, v27 +; SDAG-NEXT: v_or_b32_e32 v18, v16, v18 +; SDAG-NEXT: v_add_i32_e32 v16, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v19, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v21, vcc, 64, v21 +; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v19, v19, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc -; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v20, v17, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v17, v22, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc +; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v16, v20 +; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v19, v17, vcc ; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v22 ; SDAG-NEXT: v_subb_u32_e32 v24, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[22:23] -; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[22:23] +; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v25, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v16, v16, v24 ; SDAG-NEXT: v_or_b32_e32 v17, v23, v25 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[24:25] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[24:25] +; SDAG-NEXT: v_cndmask_b32_e32 v19, v20, v19, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[24:25] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v19, v18, s[4:5] -; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5] -; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5] -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v18, v19 +; SDAG-NEXT: v_xor_b32_e32 v18, 1, v17 +; SDAG-NEXT: v_and_b32_e32 v17, 1, v17 +; SDAG-NEXT: v_and_b32_e32 v18, v18, v16 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v19, 1, v18 +; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, vcc +; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB1_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v26, vcc, 1, v22 @@ -1022,22 +1037,24 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v22, v4 ; SDAG-NEXT: v_ffbh_u32_e32 v23, v5 ; SDAG-NEXT: v_mov_b32_e32 v24, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[4:5], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v8 -; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, 
v10 -; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v20 -; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v22 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; SDAG-NEXT: v_min_u32_e32 v0, v0, v9 -; SDAG-NEXT: v_min_u32_e32 v1, v1, v11 -; SDAG-NEXT: v_min_u32_e32 v2, v2, v21 -; SDAG-NEXT: v_min_u32_e32 v3, v3, v23 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_add_i32_e32 v1, vcc, 32, v8 +; SDAG-NEXT: v_add_i32_e32 v3, vcc, 32, v10 +; SDAG-NEXT: v_add_i32_e32 v8, vcc, 32, v20 +; SDAG-NEXT: v_add_i32_e32 v10, vcc, 32, v22 +; SDAG-NEXT: v_or_b32_e32 v22, v0, v2 +; SDAG-NEXT: v_min_u32_e32 v0, v1, v9 +; SDAG-NEXT: v_min_u32_e32 v1, v3, v11 +; SDAG-NEXT: v_min_u32_e32 v2, v8, v21 +; SDAG-NEXT: v_min_u32_e32 v3, v10, v23 ; SDAG-NEXT: v_add_i32_e32 v1, vcc, 64, v1 -; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_addc_u32_e64 v8, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3 -; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_addc_u32_e64 v9, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] ; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -1048,7 +1065,7 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc ; SDAG-NEXT: v_xor_b32_e32 v2, 0x7f, v0 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v24, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1] ; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v24, vcc ; SDAG-NEXT: v_or_b32_e32 v2, v2, v20 @@ -1058,16 +1075,19 @@ define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; SDAG-NEXT: v_and_b32_e32 v2, 1, v8 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5] -; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v3, v22, v8 +; SDAG-NEXT: v_xor_b32_e32 v9, 1, v3 +; SDAG-NEXT: v_and_b32_e32 v3, 1, v3 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v2, v9, v2 +; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v2, 1, v2 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_cbranch_execz .LBB1_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v22, vcc, 1, v0 @@ -1564,7 +1584,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 0, v0 ; SDAG-NEXT: v_mov_b32_e32 v19, 0 ; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, 0, v1, vcc ; SDAG-NEXT: v_mov_b32_e32 v29, v28 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v2, vcc @@ -1573,73 +1593,78 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: 
v_cndmask_b32_e64 v16, v0, v16, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v1, vcc, 0, v3, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v0, v2, v18, s[4:5] -; SDAG-NEXT: v_ffbh_u32_e32 v18, v16 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v17 -; SDAG-NEXT: v_sub_i32_e32 v21, vcc, 0, v8 +; SDAG-NEXT: v_ffbh_u32_e32 v2, v16 +; SDAG-NEXT: v_ffbh_u32_e32 v18, v17 +; SDAG-NEXT: v_sub_i32_e32 v20, vcc, 0, v8 ; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], 32, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v0 ; SDAG-NEXT: v_or_b32_e32 v2, v16, v0 -; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v0 -; SDAG-NEXT: v_subb_u32_e32 v23, vcc, 0, v9, vcc +; SDAG-NEXT: v_subb_u32_e32 v22, vcc, 0, v9, vcc +; SDAG-NEXT: v_min_u32_e32 v18, v3, v18 +; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], 32, v21 +; SDAG-NEXT: v_ffbh_u32_e32 v23, v1 ; SDAG-NEXT: v_or_b32_e32 v3, v17, v1 -; SDAG-NEXT: v_min_u32_e32 v18, v18, v20 -; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], 32, v22 -; SDAG-NEXT: v_ffbh_u32_e32 v22, v1 ; SDAG-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v23, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v30, v9, v22, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v10, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v21, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v31, v8, v20, s[4:5] +; SDAG-NEXT: v_min_u32_e32 v8, v21, v23 +; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 64, v18 +; SDAG-NEXT: v_addc_u32_e64 v20, s[6:7], 0, 0, s[6:7] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3] -; SDAG-NEXT: v_min_u32_e32 v3, v20, v22 -; SDAG-NEXT: v_add_i32_e64 v8, s[8:9], 64, v18 -; SDAG-NEXT: v_addc_u32_e64 v18, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v11, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v3, vcc, 0, v11, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v9, s[4:5] ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v10, v8, v3, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v18, v8, vcc ; SDAG-NEXT: v_ffbh_u32_e32 v9, v31 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v30 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v20, s[4:5] +; SDAG-NEXT: v_ffbh_u32_e32 v18, v30 +; SDAG-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v8, v31, v2 ; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v9 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v2 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v2 ; SDAG-NEXT: v_or_b32_e32 v9, v30, v3 -; SDAG-NEXT: v_min_u32_e32 v11, v11, v21 -; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 -; SDAG-NEXT: v_ffbh_u32_e32 v21, v3 +; SDAG-NEXT: v_min_u32_e32 v11, v11, v18 +; SDAG-NEXT: v_add_i32_e32 v18, vcc, 32, v22 +; SDAG-NEXT: v_ffbh_u32_e32 v22, v3 +; SDAG-NEXT: v_min_u32_e32 v18, v18, v22 +; SDAG-NEXT: v_add_i32_e32 v11, vcc, 64, v11 +; SDAG-NEXT: v_addc_u32_e64 v22, s[4:5], 0, 0, vcc ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_min_u32_e32 v8, v20, v21 -; SDAG-NEXT: v_add_i32_e64 v9, s[4:5], 64, v11 -; SDAG-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_cmp_ne_u64_e64 s[4:5], 0, v[2:3] -; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[4:5] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v8, v10 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v18, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] +; SDAG-NEXT: v_cndmask_b32_e64 v9, v22, 0, vcc +; 
SDAG-NEXT: v_cndmask_b32_e32 v11, v11, v18, vcc +; SDAG-NEXT: v_or_b32_e32 v21, v8, v21 +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v11, v10 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v9, v20, vcc ; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v10 ; SDAG-NEXT: v_subb_u32_e32 v18, vcc, 0, v19, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[10:11] ; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v19, vcc ; SDAG-NEXT: v_or_b32_e32 v8, v8, v18 ; SDAG-NEXT: v_or_b32_e32 v9, v11, v19 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e32 v20, v22, v20, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; SDAG-NEXT: v_cndmask_b32_e64 v8, v21, v20, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v9, v21, v20 +; SDAG-NEXT: v_xor_b32_e32 v20, 1, v9 +; SDAG-NEXT: v_and_b32_e32 v9, 1, v9 +; SDAG-NEXT: v_and_b32_e32 v8, v20, v8 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, vcc ; SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v34, v1, 0, s[4:5] -; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v27, v17, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, s[4:5] -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v33, v16, 0, vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10 @@ -1759,7 +1784,7 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7 ; SDAG-NEXT: v_sub_i32_e32 v8, vcc, 0, v4 ; SDAG-NEXT: v_mov_b32_e32 v18, 0 -; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f +; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f ; SDAG-NEXT: v_mov_b32_e32 v35, v26 ; SDAG-NEXT: v_subb_u32_e32 v9, vcc, 0, v5, vcc ; SDAG-NEXT: v_subb_u32_e32 v10, vcc, 0, v6, vcc @@ -1785,56 +1810,61 @@ define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cndmask_b32_e64 v36, v13, v21, s[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v37, v12, v19, s[4:5] ; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[6:7] ; SDAG-NEXT: v_min_u32_e32 v7, v20, v22 -; SDAG-NEXT: v_add_i32_e64 v10, s[8:9], 64, v10 -; SDAG-NEXT: v_addc_u32_e64 v12, s[8:9], 0, 0, s[8:9] -; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v15, vcc +; SDAG-NEXT: v_add_i32_e64 v10, s[6:7], 64, v10 +; SDAG-NEXT: v_addc_u32_e64 v13, s[6:7], 0, 0, s[6:7] +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, 0, v15, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v6, v14, v11, s[4:5] ; SDAG-NEXT: v_ffbh_u32_e32 v11, v37 ; SDAG-NEXT: v_ffbh_u32_e32 v14, v36 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v19, v10, v7, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v7, v15, v13, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v20, v10, v7, vcc +; SDAG-NEXT: v_cndmask_b32_e64 
v7, v15, v19, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v10, v37, v6 -; SDAG-NEXT: v_ffbh_u32_e32 v13, v6 -; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v11 +; SDAG-NEXT: v_ffbh_u32_e32 v15, v6 +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 32, v11 ; SDAG-NEXT: v_or_b32_e32 v11, v36, v7 -; SDAG-NEXT: v_add_i32_e32 v13, vcc, 32, v13 -; SDAG-NEXT: v_ffbh_u32_e32 v20, v7 -; SDAG-NEXT: v_min_u32_e32 v14, v15, v14 +; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v15 +; SDAG-NEXT: v_ffbh_u32_e32 v21, v7 +; SDAG-NEXT: v_min_u32_e32 v14, v19, v14 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_min_u32_e32 v10, v13, v20 -; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v14 -; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SDAG-NEXT: v_min_u32_e32 v11, v15, v21 +; SDAG-NEXT: v_add_i32_e32 v14, vcc, 64, v14 +; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v10, v12 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc -; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v19 -; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v13, v12, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v12, v15, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v10, v14, v11, vcc +; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v20 +; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v12, v13, vcc ; SDAG-NEXT: v_xor_b32_e32 v14, 0x7f, v10 ; SDAG-NEXT: v_subb_u32_e32 v12, vcc, 0, v18, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[10:11] +; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v13, vcc, 0, v18, vcc ; SDAG-NEXT: v_or_b32_e32 v14, v14, v12 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] ; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; SDAG-NEXT: v_or_b32_e32 v15, v11, v13 ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13] -; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v19, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v18, v20, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] -; SDAG-NEXT: v_and_b32_e32 v14, 1, v18 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v14 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] -; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v15, v19, v18 +; SDAG-NEXT: v_xor_b32_e32 v18, 1, v15 +; SDAG-NEXT: v_and_b32_e32 v15, 1, v15 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v20, v18, v14 +; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v15, v9, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v14, v8, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v20, 1, v20 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v20 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_cbranch_execz .LBB2_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v10 @@ -2466,8 +2496,6 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; SDAG-NEXT: v_or_b32_e32 v17, v9, v11 ; SDAG-NEXT: v_or_b32_e32 v16, v8, v10 -; SDAG-NEXT: 
v_or_b32_e32 v19, v1, v3 -; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 ; SDAG-NEXT: v_ffbh_u32_e32 v20, v10 ; SDAG-NEXT: v_ffbh_u32_e32 v21, v11 ; SDAG-NEXT: v_ffbh_u32_e32 v22, v8 @@ -2477,52 +2505,59 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v0 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v1 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: v_or_b32_e32 v19, v1, v3 +; SDAG-NEXT: v_or_b32_e32 v18, v0, v2 +; SDAG-NEXT: s_mov_b64 s[4:5], 0x7f +; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v20 +; SDAG-NEXT: v_add_i32_e32 v22, vcc, 32, v22 +; SDAG-NEXT: v_add_i32_e32 v24, vcc, 32, v24 +; SDAG-NEXT: v_add_i32_e32 v26, vcc, 32, v26 +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 -; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 -; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 -; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 -; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 -; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 -; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 -; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 -; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc -; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 -; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; SDAG-NEXT: v_min_u32_e32 v17, v20, v21 +; SDAG-NEXT: v_min_u32_e32 v19, v22, v23 +; SDAG-NEXT: v_min_u32_e32 v20, v24, v25 +; SDAG-NEXT: v_min_u32_e32 v21, v26, v27 +; SDAG-NEXT: v_or_b32_e32 v22, v16, v18 +; SDAG-NEXT: v_add_i32_e32 v16, vcc, 64, v19 +; SDAG-NEXT: v_addc_u32_e64 v18, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v21 +; SDAG-NEXT: v_addc_u32_e64 v21, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v23, v18, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v16, v16, v17, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v20, vcc ; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v16, v18 -; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v20, v17, vcc +; SDAG-NEXT: v_subb_u32_e32 v19, vcc, v23, v17, vcc ; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v18 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[18:19] -; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v16, v16, v20 ; SDAG-NEXT: v_or_b32_e32 v17, v19, v21 ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21] -; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] +; SDAG-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[20:21] -; SDAG-NEXT: v_cndmask_b32_e64 v16, v23, v22, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v17, v22, v23 +; SDAG-NEXT: v_xor_b32_e32 v22, 1, v17 +; SDAG-NEXT: v_and_b32_e32 v17, 1, v17 +; 
SDAG-NEXT: v_and_b32_e32 v16, v22, v16 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, vcc ; SDAG-NEXT: v_and_b32_e32 v16, 1, v16 +; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, vcc ; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5] -; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[8:9], s[6:7], vcc -; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5] -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[8:9] +; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB3_6 ; SDAG-NEXT: ; %bb.1: ; %udiv-bb15 ; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v18 @@ -2652,22 +2687,24 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_ffbh_u32_e32 v26, v4 ; SDAG-NEXT: v_ffbh_u32_e32 v27, v5 ; SDAG-NEXT: v_mov_b32_e32 v28, 0 -; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f +; SDAG-NEXT: s_mov_b64 s[4:5], 0x7f +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17] -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19] -; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20 -; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22 -; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24 -; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26 -; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5] -; SDAG-NEXT: v_min_u32_e32 v16, v16, v21 -; SDAG-NEXT: v_min_u32_e32 v17, v17, v23 -; SDAG-NEXT: v_min_u32_e32 v18, v18, v25 -; SDAG-NEXT: v_min_u32_e32 v19, v19, v27 +; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; SDAG-NEXT: v_add_i32_e32 v17, vcc, 32, v20 +; SDAG-NEXT: v_add_i32_e32 v19, vcc, 32, v22 +; SDAG-NEXT: v_add_i32_e32 v20, vcc, 32, v24 +; SDAG-NEXT: v_add_i32_e32 v22, vcc, 32, v26 +; SDAG-NEXT: v_or_b32_e32 v24, v16, v18 +; SDAG-NEXT: v_min_u32_e32 v16, v17, v21 +; SDAG-NEXT: v_min_u32_e32 v17, v19, v23 +; SDAG-NEXT: v_min_u32_e32 v18, v20, v25 +; SDAG-NEXT: v_min_u32_e32 v19, v22, v27 ; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17 -; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_addc_u32_e64 v20, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19 -; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc +; SDAG-NEXT: v_addc_u32_e64 v21, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] ; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc ; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc @@ -2678,7 +2715,7 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc ; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16 ; SDAG-NEXT: v_subb_u32_e32 v20, vcc, 0, v28, vcc -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17] +; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[16:17] ; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5] ; SDAG-NEXT: v_subb_u32_e32 v21, vcc, 0, v28, vcc ; SDAG-NEXT: v_or_b32_e32 v18, v18, v20 @@ -2688,16 +2725,19 @@ define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21] ; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc ; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19] -; SDAG-NEXT: v_and_b32_e32 v18, 1, v22 -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 -; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5] -; SDAG-NEXT: 
v_cndmask_b32_e64 v23, v7, 0, s[4:5] -; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SDAG-NEXT: v_or_b32_e32 v19, v24, v22 +; SDAG-NEXT: v_xor_b32_e32 v22, 1, v19 +; SDAG-NEXT: v_and_b32_e32 v19, 1, v19 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v19 +; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v24, v22, v18 +; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, vcc +; SDAG-NEXT: v_and_b32_e32 v24, 1, v24 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v24 +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_cbranch_execz .LBB3_12 ; SDAG-NEXT: ; %bb.7: ; %udiv-bb1 ; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16 diff --git a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll index 72913d2596ebf..231b7afda1a15 100644 --- a/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll +++ b/llvm/test/CodeGen/AMDGPU/exec-mask-opt-cannot-create-empty-or-backward-segment.ll @@ -4,13 +4,13 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 %arg1, i1 %arg2, i1 %arg3, i1 %arg4, i1 %arg5) { ; CHECK-LABEL: cannot_create_empty_or_backwards_segment: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b64 s[26:27], s[2:3] -; CHECK-NEXT: s_mov_b64 s[24:25], s[0:1] +; CHECK-NEXT: s_mov_b64 s[30:31], s[2:3] +; CHECK-NEXT: s_mov_b64 s[28:29], s[0:1] ; CHECK-NEXT: s_load_dword s2, s[8:9], 0x0 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 ; CHECK-NEXT: s_load_dword s6, s[8:9], 0x4 -; CHECK-NEXT: s_add_u32 s24, s24, s17 -; CHECK-NEXT: s_addc_u32 s25, s25, 0 +; CHECK-NEXT: s_add_u32 s28, s28, s17 +; CHECK-NEXT: s_addc_u32 s29, s29, 0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s2, 0 ; CHECK-NEXT: s_cselect_b64 s[16:17], -1, 0 @@ -18,9 +18,12 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_cselect_b64 s[10:11], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s2, 16 ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 +; CHECK-NEXT: s_lshr_b32 s4, s0, 24 ; CHECK-NEXT: s_bitcmp1_b32 s0, 24 ; CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 -; CHECK-NEXT: s_xor_b64 s[4:5], s[8:9], -1 +; CHECK-NEXT: s_not_b32 s0, s4 +; CHECK-NEXT: s_bitcmp1_b32 s0, 0 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s1, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; CHECK-NEXT: s_cselect_b64 s[12:13], -1, 0 @@ -40,6 +43,11 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_mov_b64 s[22:23], -1 ; CHECK-NEXT: .LBB0_2: ; %Flow7 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[20:21] +; CHECK-NEXT: v_readfirstlane_b32 s20, v1 +; CHECK-NEXT: s_not_b32 s20, s20 +; CHECK-NEXT: s_bitcmp1_b32 s20, 0 +; CHECK-NEXT: s_cselect_b64 s[24:25], -1, 0 ; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_12 ; CHECK-NEXT: .LBB0_3: ; %bb7 @@ -88,9 +96,8 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_branch .LBB0_2 ; CHECK-NEXT: .LBB0_12: ; %loop.exit.guard6 ; CHECK-NEXT: ; in 
Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: s_xor_b64 s[22:23], s[20:21], -1 ; CHECK-NEXT: s_mov_b64 s[20:21], -1 -; CHECK-NEXT: s_and_b64 vcc, exec, s[22:23] +; CHECK-NEXT: s_and_b64 vcc, exec, s[24:25] ; CHECK-NEXT: s_cbranch_vccz .LBB0_16 ; CHECK-NEXT: ; %bb.13: ; %bb14 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 @@ -98,8 +105,8 @@ define amdgpu_kernel void @cannot_create_empty_or_backwards_segment(i1 %arg, i1 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_15 ; CHECK-NEXT: ; %bb.14: ; %bb15 ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 offset:4 -; CHECK-NEXT: buffer_store_dword v0, off, s[24:27], 0 +; CHECK-NEXT: buffer_store_dword v0, off, s[28:31], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v0, off, s[28:31], 0 ; CHECK-NEXT: .LBB0_15: ; %Flow ; CHECK-NEXT: ; in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: s_mov_b64 s[20:21], 0 diff --git a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll index 41082821bafe3..f2faa576d98c7 100644 --- a/llvm/test/CodeGen/AMDGPU/extract-subvector.ll +++ b/llvm/test/CodeGen/AMDGPU/extract-subvector.ll @@ -5,8 +5,9 @@ define <2 x i16> @extract_2xi16(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-LABEL: extract_2xi16: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr5 ; GCN-NEXT: ; implicit-def: $vgpr4 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc @@ -99,8 +100,9 @@ define <2 x i64> @extract_2xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-LABEL: extract_2xi64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -169,8 +171,9 @@ define <4 x i64> @extract_4xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-LABEL: extract_4xi64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -245,8 +248,9 @@ define <8 x i64> @extract_8xi64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i1 % ; GCN-LABEL: extract_8xi64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -348,8 +352,9 @@ define <2 x double> @extract_2xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-LABEL: extract_2xf64: ; GCN: 
; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -418,8 +423,9 @@ define <4 x double> @extract_4xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-LABEL: extract_4xf64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -494,8 +500,9 @@ define <8 x double> @extract_8xf64(ptr addrspace(1) %p0, ptr addrspace(1) %p1, i ; GCN-LABEL: extract_8xf64: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v4, v4 ; GCN-NEXT: v_and_b32_e32 v4, 1, v4 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 ; GCN-NEXT: ; implicit-def: $vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31_vgpr32_vgpr33_vgpr34_vgpr35 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll index c69b0cce3d208..ad1878d034a87 100644 --- a/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/extract_vector_dynelt.ll @@ -85,27 +85,31 @@ define amdgpu_kernel void @double5_extelt(ptr addrspace(1) %out, i32 %sel) { ; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GCN-NEXT: s_mov_b32 s2, 0x3ff028f5 ; GCN-NEXT: s_mov_b32 s3, 0xc28f5c29 +; GCN-NEXT: v_mov_b32_e32 v2, 0x70a3d70a ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_eq_u32 s6, 1 ; GCN-NEXT: s_cselect_b32 s2, s2, 0x3f847ae1 ; GCN-NEXT: s_cselect_b32 s3, s3, 0x47ae147b ; GCN-NEXT: s_cmp_eq_u32 s6, 2 -; GCN-NEXT: s_cselect_b32 s8, 0xe147ae14, s3 -; GCN-NEXT: s_cselect_b32 s7, 0x4000147a, s2 +; GCN-NEXT: s_cselect_b32 s4, 0xe147ae14, s3 +; GCN-NEXT: s_cselect_b32 s5, 0x4000147a, s2 ; GCN-NEXT: s_cmp_eq_u32 s6, 3 ; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN-NEXT: s_and_b64 s[4:5], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s9, 0x40100a3d, s7 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s5, 0x40100a3d, s5 ; GCN-NEXT: s_cmp_eq_u32 s6, 4 -; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GCN-NEXT: s_cselect_b32 s6, 0x40140a3d, s9 -; GCN-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GCN-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s2, 0x70a3d70a, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_cselect_b32 s2, 0x40140a3d, s5 +; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GCN-NEXT: v_mov_b32_e32 v3, s1 -; 
GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s2 ; GCN-NEXT: v_mov_b32_e32 v2, s0 ; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm @@ -1097,71 +1101,91 @@ define double @double16_extelt_vec(i32 %sel) { ; GCN-LABEL: double16_extelt_vec: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v3, 0x3ff19999 -; GCN-NEXT: v_mov_b32_e32 v4, 0x4000cccc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v0 -; GCN-NEXT: v_mov_b32_e32 v1, 0x9999999a -; GCN-NEXT: v_mov_b32_e32 v2, 0xcccccccd -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0x4008cccc -; GCN-NEXT: s_or_b64 vcc, s[4:5], vcc -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v2, v1, v2, vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0x40106666 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0x40146666 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v0 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0x40186666 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v5, 0x401c6666 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GCN-NEXT: v_or_b32_e32 v1, v2, v1 +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_mov_b32_e32 v2, 0x9999999a +; GCN-NEXT: v_mov_b32_e32 v3, 0xcccccccd +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, v2, v3, s[6:7] +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v0 +; GCN-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v0 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[8:9] +; GCN-NEXT: v_cmp_eq_u32_e64 s[10:11], 5, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[10:11] +; GCN-NEXT: v_cmp_eq_u32_e64 s[12:13], 6, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[12:13] +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v3, 1, v3 ; GCN-NEXT: v_mov_b32_e32 v4, 0x66666666 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0x40203333 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0x40223333 -; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 8, v0 -; GCN-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-NEXT: v_mov_b32_e32 v4, 0x40243333 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 9, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, 0x40263333 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 10, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, 0x40283333 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 11, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, 0x402a3333 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 12, v0 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v4, 0x402c3333 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 13, v0 -; 
GCN-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc -; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: v_mov_b32_e32 v5, 0x402e3333 -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 14, v0 +; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 1, v3 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[14:15] +; GCN-NEXT: v_cmp_eq_u32_e64 s[14:15], 7, v0 +; GCN-NEXT: v_cmp_eq_u32_e64 s[16:17], 8, v0 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[14:15] +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[16:17] +; GCN-NEXT: v_cmp_eq_u32_e64 s[18:19], 9, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[18:19] +; GCN-NEXT: v_cmp_eq_u32_e64 s[20:21], 10, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[20:21] +; GCN-NEXT: v_cmp_eq_u32_e64 s[22:23], 11, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[22:23] +; GCN-NEXT: v_cmp_eq_u32_e64 s[24:25], 12, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[24:25] +; GCN-NEXT: v_cmp_eq_u32_e64 s[26:27], 13, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[26:27] +; GCN-NEXT: v_cmp_eq_u32_e64 s[28:29], 14, v0 +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[28:29] +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v3, 1, v3 ; GCN-NEXT: v_mov_b32_e32 v4, 0x33333333 -; GCN-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc -; GCN-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc -; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 15, v0 -; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v1, 0x40301999 -; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 1, v3 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[40:41] +; GCN-NEXT: v_cmp_eq_u32_e64 s[40:41], 15, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, v2, s[40:41] +; GCN-NEXT: v_mov_b32_e32 v1, 0x3ff19999 +; GCN-NEXT: v_mov_b32_e32 v2, 0x4000cccc +; GCN-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc +; GCN-NEXT: v_mov_b32_e32 v2, 0x4008cccc +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[4:5] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40106666 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[6:7] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40146666 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[8:9] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40186666 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[10:11] +; GCN-NEXT: v_mov_b32_e32 v2, 0x401c6666 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[12:13] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40203333 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[14:15] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40223333 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[16:17] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40243333 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[18:19] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40263333 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[20:21] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40283333 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[22:23] +; GCN-NEXT: v_mov_b32_e32 v2, 0x402a3333 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[24:25] +; GCN-NEXT: v_mov_b32_e32 v2, 0x402c3333 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[26:27] +; GCN-NEXT: v_mov_b32_e32 v2, 0x402e3333 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[28:29] +; GCN-NEXT: v_mov_b32_e32 v2, 0x40301999 +; GCN-NEXT: v_cndmask_b32_e64 v1, v1, v2, s[40:41] ; GCN-NEXT: s_setpc_b64 s[30:31] entry: %ext = extractelement <16 x double> , i32 %sel diff --git a/llvm/test/CodeGen/AMDGPU/fceil64.ll b/llvm/test/CodeGen/AMDGPU/fceil64.ll index f34a64c470c4e..5bf609ad4b3d2 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fceil64.ll +++ b/llvm/test/CodeGen/AMDGPU/fceil64.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=CI -check-prefix=FUNC %s @@ -30,6 +31,42 @@ declare <16 x double> @llvm.ceil.v16f64(<16 x double>) nounwind readnone ; SI: v_add_f64 ; SI: s_endpgm define amdgpu_kernel void @fceil_f64(ptr addrspace(1) %out, double %x) { +; SI-LABEL: fceil_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s9, 0xfffff +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_mov_b32_e32 v3, 0x3ff00000 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_bfe_u32 s4, s7, 0xb0014 +; SI-NEXT: s_and_b32 s10, s7, 0x80000000 +; SI-NEXT: s_add_i32 s11, s4, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[8:9], s11 +; SI-NEXT: s_andn2_b64 s[4:5], s[6:7], s[4:5] +; SI-NEXT: s_cmp_lt_i32 s11, 0 +; SI-NEXT: s_cselect_b32 s4, 0, s4 +; SI-NEXT: s_cselect_b32 s5, s10, s5 +; SI-NEXT: s_cmp_gt_i32 s11, 51 +; SI-NEXT: s_cselect_b32 s5, s7, s5 +; SI-NEXT: s_cselect_b32 s4, s6, s4 +; SI-NEXT: v_cmp_gt_f64_e64 s[8:9], s[6:7], 0 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[8:9] +; SI-NEXT: v_mov_b32_e32 v1, s4 +; SI-NEXT: v_mov_b32_e32 v2, s5 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[6:7], v[1:2] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v1, v4, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v3, vcc +; SI-NEXT: v_add_f64 v[0:1], s[4:5], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %y = call double @llvm.ceil.f64(double %x) nounwind readnone store double %y, ptr addrspace(1) %out ret void @@ -39,6 +76,63 @@ define amdgpu_kernel void @fceil_f64(ptr addrspace(1) %out, double %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 define amdgpu_kernel void @fceil_v2f64(ptr addrspace(1) %out, <2 x double> %x) { +; SI-LABEL: fceil_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s9, 0xfffff +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s12, s7, 0xb0014 +; SI-NEXT: s_and_b32 s13, s7, 0x80000000 +; SI-NEXT: v_cmp_gt_f64_e64 s[10:11], s[6:7], 0 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[10:11] +; SI-NEXT: v_cmp_gt_f64_e64 s[10:11], s[4:5], 0 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[10:11] +; SI-NEXT: s_addk_i32 s12, 0xfc01 +; SI-NEXT: s_lshr_b64 s[10:11], s[8:9], s12 +; SI-NEXT: s_andn2_b64 s[10:11], s[6:7], s[10:11] +; SI-NEXT: s_cmp_lt_i32 s12, 0 +; SI-NEXT: s_cselect_b32 s10, 0, s10 +; SI-NEXT: s_cselect_b32 s11, s13, s11 +; SI-NEXT: s_cmp_gt_i32 s12, 51 +; SI-NEXT: s_cselect_b32 s11, s7, s11 +; SI-NEXT: s_cselect_b32 s10, s6, s10 +; SI-NEXT: s_bfe_u32 s12, s5, 0xb0014 +; SI-NEXT: s_and_b32 s13, s5, 
0x80000000 +; SI-NEXT: v_mov_b32_e32 v1, s10 +; SI-NEXT: v_mov_b32_e32 v2, s11 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[6:7], v[1:2] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: s_addk_i32 s12, 0xfc01 +; SI-NEXT: v_and_b32_e32 v1, v3, v1 +; SI-NEXT: s_lshr_b64 s[6:7], s[8:9], s12 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: s_andn2_b64 s[6:7], s[4:5], s[6:7] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; SI-NEXT: s_cmp_lt_i32 s12, 0 +; SI-NEXT: s_cselect_b32 s6, 0, s6 +; SI-NEXT: s_cselect_b32 s7, s13, s7 +; SI-NEXT: s_cmp_gt_i32 s12, 51 +; SI-NEXT: s_cselect_b32 s7, s5, s7 +; SI-NEXT: s_cselect_b32 s6, s4, s6 +; SI-NEXT: v_add_f64 v[2:3], s[10:11], v[0:1] +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[4:5] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v1, v7, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v6, vcc +; SI-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1] +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm %y = call <2 x double> @llvm.ceil.v2f64(<2 x double> %x) nounwind readnone store <2 x double> %y, ptr addrspace(1) %out ret void @@ -60,6 +154,108 @@ define amdgpu_kernel void @fceil_v2f64(ptr addrspace(1) %out, <2 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 define amdgpu_kernel void @fceil_v4f64(ptr addrspace(1) %out, <4 x double> %x) { +; SI-LABEL: fceil_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_mov_b32 s13, 0xfffff +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: v_mov_b32_e32 v2, 0x3ff00000 +; SI-NEXT: s_mov_b32 s12, s10 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s16, s3, 0xb0014 +; SI-NEXT: s_and_b32 s17, s3, 0x80000000 +; SI-NEXT: v_cmp_gt_f64_e64 s[14:15], s[2:3], 0 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[14:15] +; SI-NEXT: v_cmp_gt_f64_e64 s[14:15], s[0:1], 0 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[14:15] +; SI-NEXT: v_cmp_gt_f64_e64 s[14:15], s[6:7], 0 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] +; SI-NEXT: v_cmp_gt_f64_e64 s[14:15], s[4:5], 0 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[14:15] +; SI-NEXT: s_addk_i32 s16, 0xfc01 +; SI-NEXT: s_lshr_b64 s[14:15], s[12:13], s16 +; SI-NEXT: s_andn2_b64 s[14:15], s[2:3], s[14:15] +; SI-NEXT: s_cmp_lt_i32 s16, 0 +; SI-NEXT: s_cselect_b32 s14, 0, s14 +; SI-NEXT: s_cselect_b32 s15, s17, s15 +; SI-NEXT: s_cmp_gt_i32 s16, 51 +; SI-NEXT: s_cselect_b32 s15, s3, s15 +; SI-NEXT: s_cselect_b32 s14, s2, s14 +; SI-NEXT: s_bfe_u32 s16, s1, 0xb0014 +; SI-NEXT: s_and_b32 s17, s1, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v3, s14 +; SI-NEXT: v_mov_b32_e32 v4, s15 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[2:3], v[3:4] +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: s_addk_i32 s16, 0xfc01 +; SI-NEXT: v_and_b32_e32 v1, v1, v3 +; SI-NEXT: s_lshr_b64 s[2:3], s[12:13], s16 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-NEXT: s_cmp_lt_i32 s16, 0 +; SI-NEXT: v_add_f64 v[5:6], s[14:15], v[0:1] +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s17, s3 +; SI-NEXT: s_cmp_gt_i32 s16, 51 +; SI-NEXT: s_cselect_b32 s3, s1, s3 +; SI-NEXT: s_cselect_b32 s2, s0, s2 +; SI-NEXT: s_bfe_u32 s14, s7, 0xb0014 +; SI-NEXT: 
s_and_b32 s15, s7, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v4, s3 +; SI-NEXT: v_mov_b32_e32 v3, s2 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[0:1], v[3:4] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: s_addk_i32 s14, 0xfc01 +; SI-NEXT: v_and_b32_e32 v1, v7, v1 +; SI-NEXT: s_lshr_b64 s[0:1], s[12:13], s14 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: s_andn2_b64 s[0:1], s[6:7], s[0:1] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-NEXT: s_cmp_lt_i32 s14, 0 +; SI-NEXT: v_add_f64 v[3:4], s[2:3], v[0:1] +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s15, s1 +; SI-NEXT: s_cmp_gt_i32 s14, 51 +; SI-NEXT: s_cselect_b32 s1, s7, s1 +; SI-NEXT: s_cselect_b32 s0, s6, s0 +; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; SI-NEXT: s_and_b32 s14, s5, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v8, s1 +; SI-NEXT: v_mov_b32_e32 v7, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[6:7], v[7:8] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v1, v9, v1 +; SI-NEXT: s_lshr_b64 s[2:3], s[12:13], s6 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s14, s3 +; SI-NEXT: s_cmp_gt_i32 s6, 51 +; SI-NEXT: s_cselect_b32 s3, s5, s3 +; SI-NEXT: s_cselect_b32 s2, s4, s2 +; SI-NEXT: v_add_f64 v[9:10], s[0:1], v[0:1] +; SI-NEXT: v_mov_b32_e32 v8, s3 +; SI-NEXT: v_mov_b32_e32 v7, s2 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[7:8] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v1, v11, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; SI-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc +; SI-NEXT: v_add_f64 v[7:8], s[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx4 v[7:10], off, s[8:11], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[3:6], off, s[8:11], 0 +; SI-NEXT: s_endpgm %y = call <4 x double> @llvm.ceil.v4f64(<4 x double> %x) nounwind readnone store <4 x double> %y, ptr addrspace(1) %out ret void @@ -75,6 +271,199 @@ define amdgpu_kernel void @fceil_v4f64(ptr addrspace(1) %out, <4 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 define amdgpu_kernel void @fceil_v8f64(ptr addrspace(1) %out, <8 x double> %x) { +; SI-LABEL: fceil_v8f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x19 +; SI-NEXT: s_mov_b32 s19, 0xf000 +; SI-NEXT: s_mov_b32 s18, -1 +; SI-NEXT: s_mov_b32 s21, 0xfffff +; SI-NEXT: v_mov_b32_e32 v4, 0 +; SI-NEXT: v_mov_b32_e32 v6, 0x3ff00000 +; SI-NEXT: s_mov_b32 s20, s18 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s24, s3, 0xb0014 +; SI-NEXT: s_and_b32 s25, s3, 0x80000000 +; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[2:3], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[22:23] +; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[0:1], 0 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[22:23] +; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[6:7], 0 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[22:23] +; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[4:5], 0 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[22:23] +; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[10:11], 0 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[22:23] +; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[8:9], 0 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[22:23] +; SI-NEXT: v_cmp_gt_f64_e64 s[22:23], s[14:15], 0 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[22:23] +; SI-NEXT: 
v_cmp_gt_f64_e64 s[22:23], s[12:13], 0 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[22:23] +; SI-NEXT: s_addk_i32 s24, 0xfc01 +; SI-NEXT: s_lshr_b64 s[22:23], s[20:21], s24 +; SI-NEXT: s_andn2_b64 s[22:23], s[2:3], s[22:23] +; SI-NEXT: s_cmp_lt_i32 s24, 0 +; SI-NEXT: s_cselect_b32 s22, 0, s22 +; SI-NEXT: s_cselect_b32 s23, s25, s23 +; SI-NEXT: s_cmp_gt_i32 s24, 51 +; SI-NEXT: s_cselect_b32 s23, s3, s23 +; SI-NEXT: s_cselect_b32 s22, s2, s22 +; SI-NEXT: s_bfe_u32 s24, s1, 0xb0014 +; SI-NEXT: s_and_b32 s25, s1, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v2, s22 +; SI-NEXT: v_mov_b32_e32 v3, s23 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[2:3], v[2:3] +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: s_addk_i32 s24, 0xfc01 +; SI-NEXT: v_and_b32_e32 v0, v0, v2 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s24 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: s_cmp_lt_i32 s24, 0 +; SI-NEXT: v_add_f64 v[2:3], s[22:23], v[4:5] +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s25, s3 +; SI-NEXT: s_cmp_gt_i32 s24, 51 +; SI-NEXT: s_cselect_b32 s3, s1, s3 +; SI-NEXT: s_cselect_b32 s2, s0, s2 +; SI-NEXT: s_bfe_u32 s22, s7, 0xb0014 +; SI-NEXT: s_and_b32 s23, s7, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v14, s3 +; SI-NEXT: v_mov_b32_e32 v13, s2 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[0:1], v[13:14] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_addk_i32 s22, 0xfc01 +; SI-NEXT: v_and_b32_e32 v0, v1, v0 +; SI-NEXT: s_lshr_b64 s[0:1], s[20:21], s22 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: s_andn2_b64 s[0:1], s[6:7], s[0:1] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: s_cmp_lt_i32 s22, 0 +; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5] +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; SI-NEXT: s_cselect_b32 s1, s23, s1 +; SI-NEXT: s_cmp_gt_i32 s22, 51 +; SI-NEXT: s_cselect_b32 s1, s7, s1 +; SI-NEXT: s_cselect_b32 s0, s6, s0 +; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; SI-NEXT: s_and_b32 s22, s5, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v14, s1 +; SI-NEXT: v_mov_b32_e32 v13, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[6:7], v[13:14] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v5, v8, v5 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s6 +; SI-NEXT: v_and_b32_e32 v5, 1, v5 +; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: v_add_f64 v[15:16], s[0:1], v[4:5] +; SI-NEXT: s_cselect_b32 s0, 0, s2 +; SI-NEXT: s_cselect_b32 s1, s22, s3 +; SI-NEXT: s_cmp_gt_i32 s6, 51 +; SI-NEXT: s_cselect_b32 s1, s5, s1 +; SI-NEXT: s_cselect_b32 s0, s4, s0 +; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 +; SI-NEXT: s_and_b32 s6, s11, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v14, s1 +; SI-NEXT: v_mov_b32_e32 v13, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[13:14] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: s_add_i32 s4, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v5, v10, v5 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s4 +; SI-NEXT: v_and_b32_e32 v5, 1, v5 +; SI-NEXT: s_andn2_b64 s[2:3], s[10:11], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: s_cmp_lt_i32 s4, 0 +; SI-NEXT: v_add_f64 v[13:14], s[0:1], v[4:5] +; SI-NEXT: s_cselect_b32 s0, 0, s2 +; SI-NEXT: s_cselect_b32 s1, s6, s3 +; SI-NEXT: s_cmp_gt_i32 s4, 51 +; SI-NEXT: s_cselect_b32 
s1, s11, s1 +; SI-NEXT: s_cselect_b32 s0, s10, s0 +; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s9, 0x80000000 +; SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[16:19], 0 offset:16 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v14, s1 +; SI-NEXT: v_mov_b32_e32 v13, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[10:11], v[13:14] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: s_add_i32 s5, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v5, v7, v5 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s5 +; SI-NEXT: v_and_b32_e32 v5, 1, v5 +; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: v_add_f64 v[15:16], s[0:1], v[4:5] +; SI-NEXT: s_cselect_b32 s0, 0, s2 +; SI-NEXT: s_cselect_b32 s1, s4, s3 +; SI-NEXT: s_cmp_gt_i32 s5, 51 +; SI-NEXT: s_cselect_b32 s1, s9, s1 +; SI-NEXT: s_cselect_b32 s0, s8, s0 +; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 +; SI-NEXT: s_and_b32 s4, s15, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v8, s1 +; SI-NEXT: v_mov_b32_e32 v7, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[8:9], v[7:8] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: s_add_i32 s5, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v5, v9, v5 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s5 +; SI-NEXT: v_and_b32_e32 v5, 1, v5 +; SI-NEXT: s_andn2_b64 s[2:3], s[14:15], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: v_add_f64 v[13:14], s[0:1], v[4:5] +; SI-NEXT: s_cselect_b32 s0, 0, s2 +; SI-NEXT: s_cselect_b32 s1, s4, s3 +; SI-NEXT: s_cmp_gt_i32 s5, 51 +; SI-NEXT: s_cselect_b32 s1, s15, s1 +; SI-NEXT: s_cselect_b32 s0, s14, s0 +; SI-NEXT: s_bfe_u32 s2, s13, 0xb0014 +; SI-NEXT: s_and_b32 s4, s13, 0x80000000 +; SI-NEXT: buffer_store_dwordx4 v[13:16], off, s[16:19], 0 offset:32 +; SI-NEXT: v_mov_b32_e32 v8, s1 +; SI-NEXT: v_mov_b32_e32 v7, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[14:15], v[7:8] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: s_add_i32 s5, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v5, v11, v5 +; SI-NEXT: s_lshr_b64 s[2:3], s[20:21], s5 +; SI-NEXT: v_and_b32_e32 v5, 1, v5 +; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s4, s3 +; SI-NEXT: s_cmp_gt_i32 s5, 51 +; SI-NEXT: s_cselect_b32 s3, s13, s3 +; SI-NEXT: s_cselect_b32 s2, s12, s2 +; SI-NEXT: v_add_f64 v[7:8], s[0:1], v[4:5] +; SI-NEXT: v_mov_b32_e32 v10, s3 +; SI-NEXT: v_mov_b32_e32 v9, s2 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[12:13], v[9:10] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v5, v12, v5 +; SI-NEXT: v_and_b32_e32 v5, 1, v5 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v5, 0, v6, vcc +; SI-NEXT: v_add_f64 v[5:6], s[2:3], v[4:5] +; SI-NEXT: buffer_store_dwordx4 v[5:8], off, s[16:19], 0 offset:48 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; SI-NEXT: s_endpgm %y = call <8 x double> @llvm.ceil.v8f64(<8 x double> %x) nounwind readnone store <8 x double> %y, ptr addrspace(1) %out ret void @@ -98,7 +487,388 @@ define amdgpu_kernel void @fceil_v8f64(ptr addrspace(1) %out, <8 x double> %x) { ; CI: v_ceil_f64_e32 ; CI: v_ceil_f64_e32 define amdgpu_kernel void @fceil_v16f64(ptr addrspace(1) %out, <16 x double> %x) { +; SI-LABEL: fceil_v16f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[36:37], s[4:5], 
0x9 +; SI-NEXT: s_load_dwordx16 s[16:31], s[4:5], 0x29 +; SI-NEXT: s_load_dwordx16 s[0:15], s[4:5], 0x39 +; SI-NEXT: s_mov_b32 s39, 0xf000 +; SI-NEXT: s_mov_b32 s38, -1 +; SI-NEXT: s_mov_b32 s35, 0xfffff +; SI-NEXT: v_mov_b32_e32 v8, 0 +; SI-NEXT: v_mov_b32_e32 v10, 0x3ff00000 +; SI-NEXT: s_mov_b32 s34, s38 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bfe_u32 s40, s19, 0xb0014 +; SI-NEXT: s_and_b32 s33, s19, 0x80000000 +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[18:19], 0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[16:17], 0 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[22:23], 0 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[20:21], 0 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[26:27], 0 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[24:25], 0 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[30:31], 0 +; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[28:29], 0 +; SI-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[2:3], 0 +; SI-NEXT: v_cndmask_b32_e64 v15, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[0:1], 0 +; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[6:7], 0 +; SI-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[4:5], 0 +; SI-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[42:43] +; SI-NEXT: v_cmp_gt_f64_e64 s[42:43], s[10:11], 0 +; SI-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[42:43] +; SI-NEXT: s_add_i32 s42, s40, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[40:41], s[34:35], s42 +; SI-NEXT: s_andn2_b64 s[40:41], s[18:19], s[40:41] +; SI-NEXT: s_cmp_lt_i32 s42, 0 +; SI-NEXT: s_cselect_b32 s40, 0, s40 +; SI-NEXT: s_cselect_b32 s33, s33, s41 +; SI-NEXT: s_cmp_gt_i32 s42, 51 +; SI-NEXT: s_cselect_b32 s41, s19, s33 +; SI-NEXT: s_cselect_b32 s40, s18, s40 +; SI-NEXT: s_bfe_u32 s33, s17, 0xb0014 +; SI-NEXT: s_and_b32 s42, s17, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v2, s40 +; SI-NEXT: v_mov_b32_e32 v3, s41 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[18:19], v[2:3] +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: s_addk_i32 s33, 0xfc01 +; SI-NEXT: v_and_b32_e32 v0, v0, v2 +; SI-NEXT: s_lshr_b64 s[18:19], s[34:35], s33 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: s_andn2_b64 s[18:19], s[16:17], s[18:19] +; SI-NEXT: s_cmp_lt_i32 s33, 0 +; SI-NEXT: s_cselect_b32 s19, s42, s19 +; SI-NEXT: s_cselect_b32 s18, 0, s18 +; SI-NEXT: s_cmp_gt_i32 s33, 51 +; SI-NEXT: s_cselect_b32 s19, s17, s19 +; SI-NEXT: s_cselect_b32 s18, s16, s18 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[16:17], v[2:3] +; SI-NEXT: v_add_f64 v[2:3], s[40:41], v[8:9] +; SI-NEXT: s_bfe_u32 s16, s23, 0xb0014 +; SI-NEXT: s_and_b32 s33, s23, 0x80000000 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: s_add_i32 s40, s16, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_lshr_b64 s[16:17], s[34:35], s40 +; SI-NEXT: s_andn2_b64 s[16:17], s[22:23], s[16:17] +; SI-NEXT: s_cmp_lt_i32 s40, 0 +; SI-NEXT: v_add_f64 v[0:1], s[18:19], v[8:9] +; SI-NEXT: s_cselect_b32 
s16, 0, s16 +; SI-NEXT: s_cselect_b32 s17, s33, s17 +; SI-NEXT: s_cmp_gt_i32 s40, 51 +; SI-NEXT: s_cselect_b32 s17, s23, s17 +; SI-NEXT: s_cselect_b32 s16, s22, s16 +; SI-NEXT: s_bfe_u32 s18, s21, 0xb0014 +; SI-NEXT: s_and_b32 s33, s21, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[22:23], v[6:7] +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: s_add_i32 s22, s18, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v4, v4, v6 +; SI-NEXT: s_lshr_b64 s[18:19], s[34:35], s22 +; SI-NEXT: v_and_b32_e32 v4, 1, v4 +; SI-NEXT: s_andn2_b64 s[18:19], s[20:21], s[18:19] +; SI-NEXT: s_cmp_lt_i32 s22, 0 +; SI-NEXT: s_cselect_b32 s19, s33, s19 +; SI-NEXT: s_cselect_b32 s18, 0, s18 +; SI-NEXT: s_cmp_gt_i32 s22, 51 +; SI-NEXT: s_cselect_b32 s19, s21, s19 +; SI-NEXT: s_cselect_b32 s18, s20, s18 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[20:21], v[6:7] +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v4, 1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; SI-NEXT: v_add_f64 v[6:7], s[16:17], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: v_add_f64 v[4:5], s[18:19], v[8:9] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[36:39], 0 offset:16 +; SI-NEXT: v_cmp_gt_f64_e64 s[16:17], s[8:9], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[16:17] +; SI-NEXT: v_cmp_gt_f64_e64 s[16:17], s[14:15], 0 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[16:17] +; SI-NEXT: s_bfe_u32 s16, s27, 0xb0014 +; SI-NEXT: s_and_b32 s18, s27, 0x80000000 +; SI-NEXT: s_add_i32 s19, s16, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[16:17], s[34:35], s19 +; SI-NEXT: s_andn2_b64 s[16:17], s[26:27], s[16:17] +; SI-NEXT: s_cmp_lt_i32 s19, 0 +; SI-NEXT: s_cselect_b32 s16, 0, s16 +; SI-NEXT: s_cselect_b32 s17, s18, s17 +; SI-NEXT: s_cmp_gt_i32 s19, 51 +; SI-NEXT: s_cselect_b32 s17, s27, s17 +; SI-NEXT: s_cselect_b32 s16, s26, s16 +; SI-NEXT: s_bfe_u32 s18, s25, 0xb0014 +; SI-NEXT: s_and_b32 s20, s25, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v6, s16 +; SI-NEXT: v_mov_b32_e32 v7, s17 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[26:27], v[6:7] +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: s_add_i32 s21, s18, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v6, v11, v6 +; SI-NEXT: s_lshr_b64 s[18:19], s[34:35], s21 +; SI-NEXT: v_and_b32_e32 v6, 1, v6 +; SI-NEXT: s_andn2_b64 s[18:19], s[24:25], s[18:19] +; SI-NEXT: s_cmp_lt_i32 s21, 0 +; SI-NEXT: s_cselect_b32 s19, s20, s19 +; SI-NEXT: s_cselect_b32 s18, 0, s18 +; SI-NEXT: s_cmp_gt_i32 s21, 51 +; SI-NEXT: s_cselect_b32 s19, s25, s19 +; SI-NEXT: s_cselect_b32 s18, s24, s18 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: v_mov_b32_e32 v6, s18 +; SI-NEXT: v_mov_b32_e32 v7, s19 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[24:25], v[6:7] +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v6, v12, v6 +; SI-NEXT: v_and_b32_e32 v6, 1, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; SI-NEXT: v_add_f64 v[22:23], s[16:17], v[8:9] +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: v_add_f64 v[20:21], s[18:19], v[8:9] +; SI-NEXT: buffer_store_dwordx4 v[20:23], off, s[36:39], 0 offset:32 +; SI-NEXT: v_cmp_gt_f64_e64 s[16:17], s[12:13], 0 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[16:17] +; SI-NEXT: s_bfe_u32 s16, s31, 0xb0014 +; SI-NEXT: s_and_b32 s18, s31, 0x80000000 +; 
SI-NEXT: s_add_i32 s19, s16, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[16:17], s[34:35], s19 +; SI-NEXT: s_andn2_b64 s[16:17], s[30:31], s[16:17] +; SI-NEXT: s_cmp_lt_i32 s19, 0 +; SI-NEXT: s_cselect_b32 s16, 0, s16 +; SI-NEXT: s_cselect_b32 s17, s18, s17 +; SI-NEXT: s_cmp_gt_i32 s19, 51 +; SI-NEXT: s_cselect_b32 s17, s31, s17 +; SI-NEXT: s_cselect_b32 s16, s30, s16 +; SI-NEXT: s_bfe_u32 s18, s29, 0xb0014 +; SI-NEXT: s_and_b32 s20, s29, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[30:31], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: s_add_i32 s21, s18, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v7, v13, v7 +; SI-NEXT: s_lshr_b64 s[18:19], s[34:35], s21 +; SI-NEXT: v_and_b32_e32 v7, 1, v7 +; SI-NEXT: s_andn2_b64 s[18:19], s[28:29], s[18:19] +; SI-NEXT: s_cmp_lt_i32 s21, 0 +; SI-NEXT: s_cselect_b32 s19, s20, s19 +; SI-NEXT: s_cselect_b32 s18, 0, s18 +; SI-NEXT: s_cmp_gt_i32 s21, 51 +; SI-NEXT: s_cselect_b32 s19, s29, s19 +; SI-NEXT: s_cselect_b32 s18, s28, s18 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: v_mov_b32_e32 v11, s18 +; SI-NEXT: v_mov_b32_e32 v12, s19 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[28:29], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v7, v14, v7 +; SI-NEXT: v_add_f64 v[13:14], s[16:17], v[8:9] +; SI-NEXT: s_bfe_u32 s16, s3, 0xb0014 +; SI-NEXT: s_and_b32 s20, s3, 0x80000000 +; SI-NEXT: s_add_i32 s21, s16, 0xfffffc01 +; SI-NEXT: s_lshr_b64 s[16:17], s[34:35], s21 +; SI-NEXT: v_and_b32_e32 v7, 1, v7 +; SI-NEXT: s_andn2_b64 s[16:17], s[2:3], s[16:17] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_cmp_lt_i32 s21, 0 +; SI-NEXT: v_add_f64 v[11:12], s[18:19], v[8:9] +; SI-NEXT: s_cselect_b32 s16, 0, s16 +; SI-NEXT: s_cselect_b32 s17, s20, s17 +; SI-NEXT: s_cmp_gt_i32 s21, 51 +; SI-NEXT: s_cselect_b32 s17, s3, s17 +; SI-NEXT: s_cselect_b32 s16, s2, s16 +; SI-NEXT: s_bfe_u32 s18, s1, 0xb0014 +; SI-NEXT: s_and_b32 s19, s1, 0x80000000 +; SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[36:39], 0 offset:48 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v11, s16 +; SI-NEXT: v_mov_b32_e32 v12, s17 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[2:3], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: s_addk_i32 s18, 0xfc01 +; SI-NEXT: v_and_b32_e32 v7, v15, v7 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s18 +; SI-NEXT: v_and_b32_e32 v7, 1, v7 +; SI-NEXT: s_andn2_b64 s[2:3], s[0:1], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_cmp_lt_i32 s18, 0 +; SI-NEXT: v_add_f64 v[13:14], s[16:17], v[8:9] +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s19, s3 +; SI-NEXT: s_cmp_gt_i32 s18, 51 +; SI-NEXT: s_cselect_b32 s3, s1, s3 +; SI-NEXT: s_cselect_b32 s2, s0, s2 +; SI-NEXT: s_bfe_u32 s16, s7, 0xb0014 +; SI-NEXT: s_and_b32 s17, s7, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v12, s3 +; SI-NEXT: v_mov_b32_e32 v11, s2 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[0:1], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: s_addk_i32 s16, 0xfc01 +; SI-NEXT: v_and_b32_e32 v7, v16, v7 +; SI-NEXT: s_lshr_b64 s[0:1], s[34:35], s16 +; SI-NEXT: v_and_b32_e32 v7, 1, v7 +; SI-NEXT: s_andn2_b64 s[0:1], s[6:7], s[0:1] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_cmp_lt_i32 s16, 0 +; SI-NEXT: v_add_f64 v[11:12], s[2:3], v[8:9] +; SI-NEXT: s_cselect_b32 s0, 0, s0 +; 
SI-NEXT: s_cselect_b32 s1, s17, s1 +; SI-NEXT: s_cmp_gt_i32 s16, 51 +; SI-NEXT: s_cselect_b32 s1, s7, s1 +; SI-NEXT: s_cselect_b32 s0, s6, s0 +; SI-NEXT: s_bfe_u32 s2, s5, 0xb0014 +; SI-NEXT: s_and_b32 s16, s5, 0x80000000 +; SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[36:39], 0 offset:64 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[6:7], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: s_add_i32 s6, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v7, v17, v7 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s6 +; SI-NEXT: v_and_b32_e32 v7, 1, v7 +; SI-NEXT: s_andn2_b64 s[2:3], s[4:5], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_cmp_lt_i32 s6, 0 +; SI-NEXT: v_add_f64 v[13:14], s[0:1], v[8:9] +; SI-NEXT: s_cselect_b32 s0, 0, s2 +; SI-NEXT: s_cselect_b32 s1, s16, s3 +; SI-NEXT: s_cmp_gt_i32 s6, 51 +; SI-NEXT: s_cselect_b32 s1, s5, s1 +; SI-NEXT: s_cselect_b32 s0, s4, s0 +; SI-NEXT: s_bfe_u32 s2, s11, 0xb0014 +; SI-NEXT: s_and_b32 s6, s11, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[4:5], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: s_add_i32 s4, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v7, v18, v7 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s4 +; SI-NEXT: v_and_b32_e32 v7, 1, v7 +; SI-NEXT: s_andn2_b64 s[2:3], s[10:11], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_cmp_lt_i32 s4, 0 +; SI-NEXT: v_add_f64 v[11:12], s[0:1], v[8:9] +; SI-NEXT: s_cselect_b32 s0, 0, s2 +; SI-NEXT: s_cselect_b32 s1, s6, s3 +; SI-NEXT: s_cmp_gt_i32 s4, 51 +; SI-NEXT: s_cselect_b32 s1, s11, s1 +; SI-NEXT: s_cselect_b32 s0, s10, s0 +; SI-NEXT: s_bfe_u32 s2, s9, 0xb0014 +; SI-NEXT: s_and_b32 s4, s9, 0x80000000 +; SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[36:39], 0 offset:80 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[10:11], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: s_add_i32 s5, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v7, v19, v7 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s5 +; SI-NEXT: v_and_b32_e32 v7, 1, v7 +; SI-NEXT: s_andn2_b64 s[2:3], s[8:9], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: v_add_f64 v[13:14], s[0:1], v[8:9] +; SI-NEXT: s_cselect_b32 s0, 0, s2 +; SI-NEXT: s_cselect_b32 s1, s4, s3 +; SI-NEXT: s_cmp_gt_i32 s5, 51 +; SI-NEXT: s_cselect_b32 s1, s9, s1 +; SI-NEXT: s_cselect_b32 s0, s8, s0 +; SI-NEXT: s_bfe_u32 s2, s15, 0xb0014 +; SI-NEXT: s_and_b32 s4, s15, 0x80000000 +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[8:9], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: s_add_i32 s5, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v5, v5, v7 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s5 +; SI-NEXT: v_and_b32_e32 v5, 1, v5 +; SI-NEXT: s_andn2_b64 s[2:3], s[14:15], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: v_add_f64 v[11:12], s[0:1], v[8:9] +; SI-NEXT: s_cselect_b32 s0, 0, s2 +; SI-NEXT: s_cselect_b32 s1, s4, s3 +; SI-NEXT: s_cmp_gt_i32 s5, 51 +; SI-NEXT: s_cselect_b32 s1, s15, s1 +; SI-NEXT: s_cselect_b32 s0, s14, s0 +; SI-NEXT: s_bfe_u32 s2, 
s13, 0xb0014 +; SI-NEXT: s_and_b32 s4, s13, 0x80000000 +; SI-NEXT: buffer_store_dwordx4 v[11:14], off, s[36:39], 0 offset:96 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v12, s1 +; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[14:15], v[11:12] +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: s_add_i32 s5, s2, 0xfffffc01 +; SI-NEXT: v_and_b32_e32 v4, v4, v5 +; SI-NEXT: s_lshr_b64 s[2:3], s[34:35], s5 +; SI-NEXT: v_and_b32_e32 v4, 1, v4 +; SI-NEXT: s_andn2_b64 s[2:3], s[12:13], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: s_cmp_lt_i32 s5, 0 +; SI-NEXT: s_cselect_b32 s2, 0, s2 +; SI-NEXT: s_cselect_b32 s3, s4, s3 +; SI-NEXT: s_cmp_gt_i32 s5, 51 +; SI-NEXT: s_cselect_b32 s3, s13, s3 +; SI-NEXT: s_cselect_b32 s2, s12, s2 +; SI-NEXT: v_add_f64 v[11:12], s[0:1], v[8:9] +; SI-NEXT: v_mov_b32_e32 v5, s3 +; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: v_cmp_lg_f64_e32 vcc, s[12:13], v[4:5] +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v4, v6, v4 +; SI-NEXT: v_and_b32_e32 v4, 1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; SI-NEXT: v_cndmask_b32_e32 v9, 0, v10, vcc +; SI-NEXT: v_add_f64 v[9:10], s[2:3], v[8:9] +; SI-NEXT: buffer_store_dwordx4 v[9:12], off, s[36:39], 0 offset:112 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[36:39], 0 +; SI-NEXT: s_endpgm %y = call <16 x double> @llvm.ceil.v16f64(<16 x double> %x) nounwind readnone store <16 x double> %y, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CI: {{.*}} +; FUNC: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll index d8c7e335e73ea..9c904275613dd 100644 --- a/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fdiv.f64.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=hawaii -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=GCN %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=GCN %s @@ -30,6 +31,43 @@ ; GCN: buffer_store_dwordx2 [[RESULT]] ; GCN: s_endpgm define amdgpu_kernel void @fdiv_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: fdiv_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s8, s2 +; SI-NEXT: s_mov_b32 s9, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 offset:8 glc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_div_scale_f64 v[4:5], s[2:3], v[2:3], v[2:3], v[0:1] +; SI-NEXT: v_div_scale_f64 v[10:11], s[2:3], v[0:1], v[2:3], v[0:1] +; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 +; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-NEXT: v_xor_b32_e32 
v12, v13, v12 +; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-NEXT: v_and_b32_e32 v12, 1, v12 +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; SI-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; SI-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; SI-NEXT: s_nop 1 +; SI-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] +; SI-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %gep.1 = getelementptr double, ptr addrspace(1) %in, i32 1 %num = load volatile double, ptr addrspace(1) %in %den = load volatile double, ptr addrspace(1) %gep.1 @@ -49,6 +87,31 @@ define amdgpu_kernel void @fdiv_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GCN: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] ; GCN: s_setpc_b64 define double @v_fdiv_f64_afn(double %x, double %y) #0 { +; CI-LABEL: v_fdiv_f64_afn: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; CI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; CI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; CI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; CI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] +; CI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: v_fdiv_f64_afn: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; SI-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 +; SI-NEXT: v_fma_f64 v[4:5], v[6:7], v[4:5], v[4:5] +; SI-NEXT: v_mul_f64 v[6:7], v[0:1], v[4:5] +; SI-NEXT: v_fma_f64 v[0:1], -v[2:3], v[6:7], v[0:1] +; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[4:5], v[6:7] +; SI-NEXT: s_setpc_b64 s[30:31] %result = fdiv afn double %x, %y ret double %result } @@ -63,12 +126,69 @@ define double @v_fdiv_f64_afn(double %x, double %y) #0 { ; GCN: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] ; GCN: s_setpc_b64 define double @v_rcp_f64_afn(double %x) #0 { +; CI-LABEL: v_rcp_f64_afn: +; CI: ; %bb.0: +; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CI-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; CI-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; CI-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; CI-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; CI-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; CI-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; CI-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; CI-NEXT: s_setpc_b64 s[30:31] +; +; SI-LABEL: v_rcp_f64_afn: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_rcp_f64_e32 v[2:3], v[0:1] +; SI-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-NEXT: v_fma_f64 v[4:5], -v[0:1], v[2:3], 1.0 +; SI-NEXT: v_fma_f64 v[2:3], v[4:5], v[2:3], v[2:3] +; SI-NEXT: v_fma_f64 v[0:1], -v[0:1], v[2:3], 1.0 +; SI-NEXT: v_fma_f64 v[0:1], v[0:1], v[2:3], v[2:3] +; SI-NEXT: s_setpc_b64 s[30:31] %result = fdiv afn double 1.0, %x ret double %result } ; GCN-LABEL: {{^}}fdiv_f64_s_v: define amdgpu_kernel void @fdiv_f64_s_v(ptr addrspace(1) %out, ptr addrspace(1) %in, double %num) #0 { +; SI-LABEL: fdiv_f64_s_v: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: 
s_load_dwordx2 s[6:7], s[2:3], 0x0 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_div_scale_f64 v[2:3], s[2:3], s[6:7], s[6:7], v[0:1] +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-NEXT: v_div_scale_f64 v[6:7], s[2:3], s[4:5], v[6:7], s[4:5] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s7, v3 +; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s5, v7 +; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-NEXT: v_div_fixup_f64 v[0:1], v[2:3], s[6:7], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %den = load double, ptr addrspace(1) %in %result = fdiv double %num, %den store double %result, ptr addrspace(1) %out @@ -77,6 +197,40 @@ define amdgpu_kernel void @fdiv_f64_s_v(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-LABEL: {{^}}fdiv_f64_v_s: define amdgpu_kernel void @fdiv_f64_v_s(ptr addrspace(1) %out, ptr addrspace(1) %in, double %den) #0 { +; SI-LABEL: fdiv_f64_v_s: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; SI-NEXT: v_mov_b32_e32 v7, s5 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_div_scale_f64 v[2:3], s[6:7], s[4:5], s[4:5], v[0:1] +; SI-NEXT: v_div_scale_f64 v[6:7], s[6:7], s[2:3], v[6:7], s[2:3] +; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s3, v7 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 +; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-NEXT: v_xor_b32_e32 v10, v10, v11 +; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-NEXT: v_div_fixup_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %num = load double, ptr addrspace(1) %in %result = fdiv double %num, %den store double %result, ptr addrspace(1) %out @@ -85,6 +239,38 @@ define amdgpu_kernel void @fdiv_f64_v_s(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-LABEL: {{^}}fdiv_f64_s_s: define amdgpu_kernel void @fdiv_f64_s_s(ptr addrspace(1) %out, double %num, double %den) #0 { +; SI-LABEL: fdiv_f64_s_s: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_waitcnt lgkmcnt(0) +; 
SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_div_scale_f64 v[2:3], s[6:7], s[4:5], s[4:5], v[0:1] +; SI-NEXT: v_mov_b32_e32 v7, s5 +; SI-NEXT: v_mov_b32_e32 v6, s4 +; SI-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] +; SI-NEXT: v_div_scale_f64 v[6:7], s[6:7], s[2:3], v[6:7], s[2:3] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s5, v3 +; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s3, v7 +; SI-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] +; SI-NEXT: v_div_fixup_f64 v[0:1], v[2:3], s[4:5], v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = fdiv double %num, %den store double %result, ptr addrspace(1) %out ret void @@ -92,6 +278,62 @@ define amdgpu_kernel void @fdiv_f64_s_s(ptr addrspace(1) %out, double %num, doub ; GCN-LABEL: {{^}}v_fdiv_v2f64: define amdgpu_kernel void @v_fdiv_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_fdiv_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_div_scale_f64 v[2:3], s[10:11], s[6:7], s[6:7], v[0:1] +; SI-NEXT: v_mov_b32_e32 v7, s1 +; SI-NEXT: v_mov_b32_e32 v6, s0 +; SI-NEXT: v_rcp_f64_e32 v[8:9], v[2:3] +; SI-NEXT: v_div_scale_f64 v[12:13], s[12:13], s[4:5], s[4:5], v[6:7] +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_fma_f64 v[14:15], -v[2:3], v[8:9], 1.0 +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; SI-NEXT: v_rcp_f64_e32 v[14:15], v[12:13] +; SI-NEXT: v_div_scale_f64 v[4:5], s[12:13], s[2:3], v[4:5], s[2:3] +; SI-NEXT: v_fma_f64 v[16:17], -v[2:3], v[8:9], 1.0 +; SI-NEXT: v_mov_b32_e32 v11, s5 +; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[16:17], v[8:9] +; SI-NEXT: v_fma_f64 v[16:17], -v[12:13], v[14:15], 1.0 +; SI-NEXT: v_mov_b32_e32 v10, s4 +; SI-NEXT: v_fma_f64 v[14:15], v[14:15], v[16:17], v[14:15] +; SI-NEXT: v_mul_f64 v[16:17], v[4:5], v[8:9] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s7, v3 +; SI-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s3, v5 +; SI-NEXT: v_fma_f64 v[2:3], -v[2:3], v[16:17], v[4:5] +; SI-NEXT: v_fma_f64 v[4:5], -v[12:13], v[14:15], 1.0 +; SI-NEXT: v_div_scale_f64 v[10:11], s[2:3], s[0:1], v[10:11], s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v18, v19, v18 +; SI-NEXT: v_fma_f64 v[4:5], v[14:15], v[4:5], v[14:15] +; SI-NEXT: v_and_b32_e32 v18, 1, v18 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; SI-NEXT: v_mul_f64 v[14:15], v[10:11], v[4:5] +; SI-NEXT: s_mov_b32 s11, 0xf000 +; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_nop 0 +; SI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[8:9], v[16:17] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s5, v13 +; SI-NEXT: v_fma_f64 v[8:9], -v[12:13], v[14:15], v[10:11] +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s1, v11 +; 
SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], s[6:7], v[0:1] +; SI-NEXT: s_nop 2 +; SI-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[14:15] +; SI-NEXT: v_div_fixup_f64 v[0:1], v[4:5], s[4:5], v[6:7] +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: s_endpgm %gep.1 = getelementptr <2 x double>, ptr addrspace(1) %in, i32 1 %num = load <2 x double>, ptr addrspace(1) %in %den = load <2 x double>, ptr addrspace(1) %gep.1 @@ -102,6 +344,62 @@ define amdgpu_kernel void @v_fdiv_v2f64(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-LABEL: {{^}}s_fdiv_v2f64: define amdgpu_kernel void @s_fdiv_v2f64(ptr addrspace(1) %out, <2 x double> %num, <2 x double> %den) { +; SI-LABEL: s_fdiv_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], s[14:15], s[14:15], v[0:1] +; SI-NEXT: v_mov_b32_e32 v6, s8 +; SI-NEXT: v_mov_b32_e32 v7, s9 +; SI-NEXT: v_rcp_f64_e32 v[8:9], v[4:5] +; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], s[12:13], s[12:13], v[6:7] +; SI-NEXT: v_mov_b32_e32 v2, s14 +; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[8:9], 1.0 +; SI-NEXT: v_mov_b32_e32 v3, s15 +; SI-NEXT: v_div_scale_f64 v[2:3], s[0:1], s[10:11], v[2:3], s[10:11] +; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] +; SI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[8:9], 1.0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s15, v5 +; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 +; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[14:15], v[8:9] +; SI-NEXT: v_fma_f64 v[14:15], -v[10:11], v[12:13], 1.0 +; SI-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v18, v17, v16 +; SI-NEXT: v_mul_f64 v[16:17], v[2:3], v[8:9] +; SI-NEXT: v_fma_f64 v[12:13], v[12:13], v[14:15], v[12:13] +; SI-NEXT: v_mov_b32_e32 v15, s13 +; SI-NEXT: v_mov_b32_e32 v14, s12 +; SI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[16:17], v[2:3] +; SI-NEXT: v_fma_f64 v[4:5], -v[10:11], v[12:13], 1.0 +; SI-NEXT: v_div_scale_f64 v[14:15], s[0:1], s[8:9], v[14:15], s[8:9] +; SI-NEXT: v_fma_f64 v[4:5], v[12:13], v[4:5], v[12:13] +; SI-NEXT: v_and_b32_e32 v18, 1, v18 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; SI-NEXT: v_mul_f64 v[12:13], v[14:15], v[4:5] +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_nop 1 +; SI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[8:9], v[16:17] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s13, v11 +; SI-NEXT: v_fma_f64 v[8:9], -v[10:11], v[12:13], v[14:15] +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s9, v15 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], s[14:15], v[0:1] +; SI-NEXT: s_nop 2 +; SI-NEXT: v_div_fmas_f64 v[4:5], v[8:9], v[4:5], v[12:13] +; SI-NEXT: v_div_fixup_f64 v[0:1], v[4:5], s[12:13], v[6:7] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = fdiv <2 x double> %num, %den store <2 x double> %result, ptr addrspace(1) %out ret void @@ -109,6 +407,107 @@ define amdgpu_kernel void @s_fdiv_v2f64(ptr addrspace(1) %out, <2 x double> %num ; 
GCN-LABEL: {{^}}v_fdiv_v4f64: define amdgpu_kernel void @v_fdiv_v4f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_fdiv_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[16:19], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_load_dwordx16 s[0:15], s[18:19], 0x0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_div_scale_f64 v[6:7], s[18:19], s[10:11], s[10:11], v[0:1] +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: v_div_scale_f64 v[2:3], s[18:19], s[2:3], v[2:3], s[2:3] +; SI-NEXT: v_rcp_f64_e32 v[12:13], v[6:7] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s11, v7 +; SI-NEXT: v_mov_b32_e32 v9, s1 +; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s3, v3 +; SI-NEXT: v_mov_b32_e32 v8, s0 +; SI-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; SI-NEXT: v_div_scale_f64 v[10:11], s[18:19], s[8:9], s[8:9], v[8:9] +; SI-NEXT: v_xor_b32_e32 v18, v17, v16 +; SI-NEXT: v_fma_f64 v[16:17], -v[6:7], v[12:13], 1.0 +; SI-NEXT: v_rcp_f64_e32 v[14:15], v[10:11] +; SI-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; SI-NEXT: v_and_b32_e32 v18, 1, v18 +; SI-NEXT: v_fma_f64 v[16:17], -v[6:7], v[12:13], 1.0 +; SI-NEXT: v_mov_b32_e32 v4, s6 +; SI-NEXT: v_fma_f64 v[12:13], v[12:13], v[16:17], v[12:13] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; SI-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; SI-NEXT: v_mul_f64 v[16:17], v[2:3], v[12:13] +; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; SI-NEXT: v_fma_f64 v[2:3], -v[6:7], v[16:17], v[2:3] +; SI-NEXT: v_div_scale_f64 v[6:7], s[2:3], s[14:15], s[14:15], v[4:5] +; SI-NEXT: v_fma_f64 v[18:19], -v[10:11], v[14:15], 1.0 +; SI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[12:13], v[16:17] +; SI-NEXT: v_fma_f64 v[12:13], v[14:15], v[18:19], v[14:15] +; SI-NEXT: v_rcp_f64_e32 v[14:15], v[6:7] +; SI-NEXT: v_mov_b32_e32 v17, s9 +; SI-NEXT: v_mov_b32_e32 v16, s8 +; SI-NEXT: v_div_scale_f64 v[16:17], s[2:3], s[0:1], v[16:17], s[0:1] +; SI-NEXT: v_fma_f64 v[18:19], -v[6:7], v[14:15], 1.0 +; SI-NEXT: s_mov_b32 s19, 0xf000 +; SI-NEXT: v_fma_f64 v[14:15], v[14:15], v[18:19], v[14:15] +; SI-NEXT: v_mul_f64 v[18:19], v[16:17], v[12:13] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s1, v17 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s9, v11 +; SI-NEXT: v_fma_f64 v[16:17], -v[10:11], v[18:19], v[16:17] +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[14:15], 1.0 +; SI-NEXT: s_mov_b32 s18, -1 +; SI-NEXT: v_fma_f64 v[10:11], v[14:15], v[10:11], v[14:15] +; SI-NEXT: v_mov_b32_e32 v14, s14 +; SI-NEXT: v_mov_b32_e32 v15, s15 +; SI-NEXT: v_div_scale_f64 v[14:15], s[0:1], s[6:7], v[14:15], s[6:7] +; SI-NEXT: v_div_fmas_f64 v[12:13], v[16:17], v[12:13], v[18:19] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s15, v7 +; SI-NEXT: v_mul_f64 v[16:17], v[14:15], v[10:11] +; SI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], s[10:11], v[0:1] +; SI-NEXT: v_fma_f64 v[18:19], -v[6:7], v[16:17], v[14:15] +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s7, v15 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v6, v7, v6 +; SI-NEXT: v_mov_b32_e32 v15, s5 +; SI-NEXT: v_and_b32_e32 v6, 1, v6 +; SI-NEXT: v_mov_b32_e32 v14, s4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; SI-NEXT: v_div_scale_f64 v[6:7], 
s[0:1], s[12:13], s[12:13], v[14:15] +; SI-NEXT: v_div_fixup_f64 v[0:1], v[12:13], s[8:9], v[8:9] +; SI-NEXT: s_nop 1 +; SI-NEXT: v_div_fmas_f64 v[10:11], v[18:19], v[10:11], v[16:17] +; SI-NEXT: v_rcp_f64_e32 v[16:17], v[6:7] +; SI-NEXT: v_fma_f64 v[18:19], -v[6:7], v[16:17], 1.0 +; SI-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] +; SI-NEXT: v_fma_f64 v[18:19], -v[6:7], v[16:17], 1.0 +; SI-NEXT: v_fma_f64 v[16:17], v[16:17], v[18:19], v[16:17] +; SI-NEXT: v_mov_b32_e32 v19, s13 +; SI-NEXT: v_mov_b32_e32 v18, s12 +; SI-NEXT: v_div_scale_f64 v[18:19], s[0:1], s[4:5], v[18:19], s[4:5] +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s13, v7 +; SI-NEXT: v_mul_f64 v[20:21], v[18:19], v[16:17] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s5, v19 +; SI-NEXT: v_fma_f64 v[18:19], -v[6:7], v[20:21], v[18:19] +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v6, 1, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; SI-NEXT: v_div_fixup_f64 v[6:7], v[10:11], s[14:15], v[4:5] +; SI-NEXT: s_nop 2 +; SI-NEXT: v_div_fmas_f64 v[16:17], v[18:19], v[16:17], v[20:21] +; SI-NEXT: v_div_fixup_f64 v[4:5], v[16:17], s[12:13], v[14:15] +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[16:19], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0 +; SI-NEXT: s_endpgm %gep.1 = getelementptr <4 x double>, ptr addrspace(1) %in, i32 1 %num = load <4 x double>, ptr addrspace(1) %in %den = load <4 x double>, ptr addrspace(1) %gep.1 @@ -119,6 +518,108 @@ define amdgpu_kernel void @v_fdiv_v4f64(ptr addrspace(1) %out, ptr addrspace(1) ; GCN-LABEL: {{^}}s_fdiv_v4f64: define amdgpu_kernel void @s_fdiv_v4f64(ptr addrspace(1) %out, <4 x double> %num, <4 x double> %den) #0 { +; SI-LABEL: s_fdiv_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s10 +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], s[18:19], s[18:19], v[0:1] +; SI-NEXT: v_mov_b32_e32 v8, s8 +; SI-NEXT: v_mov_b32_e32 v2, s18 +; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] +; SI-NEXT: v_mov_b32_e32 v9, s9 +; SI-NEXT: v_mov_b32_e32 v3, s19 +; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], s[16:17], s[16:17], v[8:9] +; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[6:7], 1.0 +; SI-NEXT: v_div_scale_f64 v[2:3], s[0:1], s[10:11], v[2:3], s[10:11] +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[12:13], v[6:7] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s19, v5 +; SI-NEXT: v_rcp_f64_e32 v[12:13], v[10:11] +; SI-NEXT: v_fma_f64 v[14:15], -v[4:5], v[6:7], 1.0 +; SI-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s11, v3 +; SI-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-NEXT: v_xor_b32_e32 v16, v17, v16 +; SI-NEXT: v_and_b32_e32 v18, 1, v16 +; SI-NEXT: v_mul_f64 v[14:15], v[2:3], v[6:7] +; SI-NEXT: v_fma_f64 v[16:17], -v[10:11], v[12:13], 1.0 +; SI-NEXT: v_fma_f64 v[2:3], -v[4:5], v[14:15], v[2:3] +; SI-NEXT: v_fma_f64 v[4:5], v[12:13], v[16:17], v[12:13] +; SI-NEXT: v_mov_b32_e32 v17, s15 +; SI-NEXT: v_mov_b32_e32 v16, s14 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; SI-NEXT: v_div_scale_f64 v[18:19], s[0:1], s[22:23], s[22:23], v[16:17] +; SI-NEXT: v_fma_f64 v[12:13], -v[10:11], v[4:5], 1.0 +; SI-NEXT: v_fma_f64 v[4:5], v[4:5], v[12:13], v[4:5] +; SI-NEXT: s_nop 0 +; SI-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[6:7], v[14:15] +; SI-NEXT: v_rcp_f64_e32 v[6:7], 
v[18:19] +; SI-NEXT: v_mov_b32_e32 v12, s16 +; SI-NEXT: v_mov_b32_e32 v13, s17 +; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], s[8:9], v[12:13], s[8:9] +; SI-NEXT: v_fma_f64 v[14:15], -v[18:19], v[6:7], 1.0 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s17, v11 +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s9, v13 +; SI-NEXT: v_fma_f64 v[12:13], -v[10:11], v[14:15], v[12:13] +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: v_fma_f64 v[10:11], -v[18:19], v[6:7], 1.0 +; SI-NEXT: v_div_fixup_f64 v[2:3], v[2:3], s[18:19], v[0:1] +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-NEXT: v_mov_b32_e32 v10, s22 +; SI-NEXT: v_mov_b32_e32 v11, s23 +; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], s[14:15], v[10:11], s[14:15] +; SI-NEXT: v_div_fmas_f64 v[12:13], v[12:13], v[4:5], v[14:15] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s23, v19 +; SI-NEXT: v_mul_f64 v[4:5], v[10:11], v[6:7] +; SI-NEXT: v_fma_f64 v[14:15], -v[18:19], v[4:5], v[10:11] +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s15, v11 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: v_mov_b32_e32 v10, s12 +; SI-NEXT: v_mov_b32_e32 v11, s13 +; SI-NEXT: v_div_scale_f64 v[18:19], s[0:1], s[20:21], s[20:21], v[10:11] +; SI-NEXT: s_nop 0 +; SI-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[6:7], v[4:5] +; SI-NEXT: v_div_fixup_f64 v[0:1], v[12:13], s[16:17], v[8:9] +; SI-NEXT: v_rcp_f64_e32 v[6:7], v[18:19] +; SI-NEXT: v_fma_f64 v[14:15], -v[18:19], v[6:7], 1.0 +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-NEXT: v_fma_f64 v[14:15], -v[18:19], v[6:7], 1.0 +; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[14:15], v[6:7] +; SI-NEXT: v_mov_b32_e32 v14, s20 +; SI-NEXT: v_mov_b32_e32 v15, s21 +; SI-NEXT: v_div_scale_f64 v[14:15], s[0:1], s[12:13], v[14:15], s[12:13] +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], s21, v19 +; SI-NEXT: v_mul_f64 v[20:21], v[14:15], v[6:7] +; SI-NEXT: v_cmp_eq_u32_e32 vcc, s13, v15 +; SI-NEXT: v_fma_f64 v[14:15], -v[18:19], v[20:21], v[14:15] +; SI-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v18, v19, v18 +; SI-NEXT: v_and_b32_e32 v18, 1, v18 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_nop 2 +; SI-NEXT: v_div_fmas_f64 v[14:15], v[14:15], v[6:7], v[20:21] +; SI-NEXT: v_div_fixup_f64 v[6:7], v[4:5], s[22:23], v[16:17] +; SI-NEXT: v_div_fixup_f64 v[4:5], v[14:15], s[20:21], v[10:11] +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = fdiv <4 x double> %num, %den store <4 x double> %result, ptr addrspace(1) %out ret void @@ -128,6 +629,16 @@ define amdgpu_kernel void @s_fdiv_v4f64(ptr addrspace(1) %out, <4 x double> %num ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, 0.5 ; GCN: buffer_store_dwordx2 [[MUL]] define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 { +; SI-LABEL: div_fast_2_x_pat_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 
0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_f64 v[0:1], s[6:7], 0.5 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, 2.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -140,6 +651,18 @@ define amdgpu_kernel void @div_fast_2_x_pat_f64(ptr addrspace(1) %out) #1 { ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]] ; GCN: buffer_store_dwordx2 [[MUL]] define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 { +; SI-LABEL: div_fast_k_x_pat_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v0, 0x9999999a +; SI-NEXT: v_mov_b32_e32 v1, 0x3fb99999 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, 10.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -152,6 +675,18 @@ define amdgpu_kernel void @div_fast_k_x_pat_f64(ptr addrspace(1) %out) #1 { ; GCN: v_mul_f64 [[MUL:v\[[0-9]+:[0-9]+\]]], s{{\[[0-9]+:[0-9]+\]}}, v[[[K_LO]]:[[K_HI]]] ; GCN: buffer_store_dwordx2 [[MUL]] define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 { +; SI-LABEL: div_fast_neg_k_x_pat_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: v_mov_b32_e32 v0, 0x9999999a +; SI-NEXT: v_mov_b32_e32 v1, 0xbfb99999 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mul_f64 v[0:1], s[6:7], v[0:1] +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = load double, ptr addrspace(1) poison %rcp = fdiv fast double %x, -10.0 store double %rcp, ptr addrspace(1) %out, align 4 @@ -160,3 +695,5 @@ define amdgpu_kernel void @div_fast_neg_k_x_pat_f64(ptr addrspace(1) %out) #1 { attributes #0 = { nounwind } attributes #1 = { nounwind "unsafe-fp-math"="true" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; GCN: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll index fd64ea3ae1c4b..9fa00fbe1a612 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-block-end-iterator-debugloc.ll @@ -5,8 +5,9 @@ define i32 @rocrand_regression(ptr addrspace(1) %arg, i32 %arg0, i1 %cmp7) { ; CHECK-LABEL: rocrand_regression: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v0, 1, v3 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; CHECK-NEXT: v_not_b32_e32 v0, v3 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: .LBB0_1: ; %do.body ; CHECK-NEXT: ; =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll index d03d53a8cbbaa..498eed0bacaf1 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-phi-regression-issue130646-issue130119.ll @@ -97,11 +97,14 @@ define amdgpu_cs void @issue130119(i1 %arg) { ; CHECK-NEXT: s_branch .LBB1_4 ; CHECK-NEXT: .LBB1_3: ; %Flow1 ; CHECK-NEXT: ; in Loop: Header=BB1_4 Depth=2 -; CHECK-NEXT: s_xor_b64 s[14:15], s[14:15], -1 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[14:15] +; CHECK-NEXT: v_not_b32_e32 v0, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 ; CHECK-NEXT: s_and_b64 s[12:13], exec, s[12:13] +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_or_b64 s[10:11], s[12:13], s[10:11] ; CHECK-NEXT: s_andn2_b64 s[8:9], s[8:9], exec -; CHECK-NEXT: s_and_b64 s[12:13], s[14:15], exec +; CHECK-NEXT: s_and_b64 s[12:13], vcc, exec ; CHECK-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] ; CHECK-NEXT: s_andn2_b64 exec, exec, s[10:11] ; CHECK-NEXT: s_cbranch_execz .LBB1_8 diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll index ffe0596a95e33..a32ce877ac119 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64.ll @@ -15218,25 +15218,30 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_cbranch_execnz .LBB115_2 ; GCN1-NEXT: .LBB115_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] -; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, 
v1, s[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_offset: @@ -15276,21 +15281,26 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] -; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_offset: @@ -15319,19 +15329,25 @@ define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { ; GFX12-NEXT: s_cbranch_execnz .LBB115_2 ; GFX12-NEXT: .LBB115_4: ; %atomicrmw.private ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s4, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] -; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, s1 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 -; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, -1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v2, v3, v2 +; 
GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %gep = getelementptr i64, ptr %out, i64 4 @@ -15346,23 +15362,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN1-NEXT: s_add_u32 s12, s12, s11 -; GCN1-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN1-NEXT: s_add_u32 s12, s12, s11 ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_add_u32 s2, s8, 32 -; GCN1-NEXT: s_addc_u32 s3, s9, 0 -; GCN1-NEXT: s_cmp_eq_u32 s3, s6 -; GCN1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN1-NEXT: s_add_u32 s0, s0, 32 +; GCN1-NEXT: s_addc_u32 s1, s1, 0 +; GCN1-NEXT: s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB116_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s2 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s3 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -15371,30 +15387,34 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: .LBB116_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB116_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[2:3], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s1 -; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN1-NEXT: s_cselect_b32 s2, s2, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] -; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, 
v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB116_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -15404,23 +15424,23 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 -; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 +; GCN2-NEXT: s_add_u32 s88, s88, s11 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_add_u32 s2, s8, 32 -; GCN2-NEXT: s_addc_u32 s3, s9, 0 -; GCN2-NEXT: s_cmp_eq_u32 s3, s6 -; GCN2-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN2-NEXT: s_add_u32 s0, s0, 32 +; GCN2-NEXT: s_addc_u32 s1, s1, 0 +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB116_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s2 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s3 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -15429,29 +15449,33 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: .LBB116_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB116_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[2:3], 0 -; GCN2-NEXT: s_cselect_b32 s2, s2, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s1 -; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] -; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 
offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB116_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -15480,19 +15504,26 @@ define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB116_3: ; %atomicrmw.private ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s6, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] -; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, s1 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v4, s4, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB116_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -15543,25 +15574,30 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN1-NEXT: s_cbranch_execnz .LBB117_2 ; GCN1-NEXT: .LBB117_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] -; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: 
buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_decr64_offset: @@ -15605,21 +15641,26 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] -; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_decr64_offset: @@ -15653,19 +15694,25 @@ define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 % ; GFX12-NEXT: s_cbranch_execnz .LBB117_2 ; GFX12-NEXT: .LBB117_4: ; %atomicrmw.private ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s4, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] -; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, s1 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 -; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, -1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: 
v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -15714,17 +15761,21 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[0:1] -; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen @@ -15772,18 +15823,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[0:1] -; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen @@ -15820,19 +15875,26 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB118_3: ; %atomicrmw.private ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s6, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] -; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; 
GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, s1 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v4, s4, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB118_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -15878,25 +15940,30 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN1-NEXT: s_cbranch_execnz .LBB119_2 ; GCN1-NEXT: .LBB119_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] -; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64: @@ -15934,21 +16001,26 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt 
vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] -; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64: @@ -15976,19 +16048,25 @@ define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { ; GFX12-NEXT: s_cbranch_execnz .LBB119_2 ; GFX12-NEXT: .LBB119_4: ; %atomicrmw.private ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s4, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] -; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, s1 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 -; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, -1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst @@ -16001,22 +16079,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GCN1-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GCN1-NEXT: s_mov_b32 s14, -1 +; GCN1-NEXT: s_load_dword s6, s[4:5], 0x3f +; GCN1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN1-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd ; GCN1-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN1-NEXT: s_add_u32 s12, s12, s11 -; GCN1-NEXT: s_load_dword s2, s[4:5], 0x3f -; GCN1-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; GCN1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd ; GCN1-NEXT: s_addc_u32 s13, s13, 0 ; GCN1-NEXT: s_waitcnt lgkmcnt(0) -; GCN1-NEXT: s_cmp_eq_u32 s9, s2 -; GCN1-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN1-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN1-NEXT: 
s_cmp_eq_u32 s1, s6 +; GCN1-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN1-NEXT: s_andn2_b64 vcc, exec, s[6:7] ; GCN1-NEXT: s_cbranch_vccz .LBB120_2 ; GCN1-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN1-NEXT: v_mov_b32_e32 v0, s8 -; GCN1-NEXT: v_mov_b32_e32 v3, s1 -; GCN1-NEXT: v_mov_b32_e32 v1, s9 -; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: v_mov_b32_e32 v0, s0 +; GCN1-NEXT: v_mov_b32_e32 v2, s4 +; GCN1-NEXT: v_mov_b32_e32 v1, s1 +; GCN1-NEXT: v_mov_b32_e32 v3, s5 ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN1-NEXT: buffer_wbinvl1_vol @@ -16025,30 +16103,34 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN1-NEXT: .LBB120_2: ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: .LBB120_3: ; %atomicrmw.private -; GCN1-NEXT: v_cmp_ne_u64_e64 s[2:3], s[8:9], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s1 -; GCN1-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN1-NEXT: s_cselect_b32 s2, s8, -1 -; GCN1-NEXT: v_mov_b32_e32 v2, s2 -; GCN1-NEXT: s_add_i32 s2, s2, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s2 +; GCN1-NEXT: v_cmp_ne_u64_e64 s[6:7], s[0:1], 0 +; GCN1-NEXT: v_mov_b32_e32 v5, s4 +; GCN1-NEXT: s_and_b64 s[6:7], s[6:7], exec +; GCN1-NEXT: s_cselect_b32 s0, s0, -1 +; GCN1-NEXT: v_mov_b32_e32 v2, s0 +; GCN1-NEXT: s_add_i32 s0, s0, 4 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v5, s0 +; GCN1-NEXT: v_mov_b32_e32 v4, s5 ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] -; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[12:15], 0 offen ; GCN1-NEXT: buffer_store_dword v4, v3, s[12:15], 0 offen ; GCN1-NEXT: .LBB120_4: ; %atomicrmw.end -; GCN1-NEXT: v_mov_b32_e32 v2, s10 -; GCN1-NEXT: v_mov_b32_e32 v3, s11 +; GCN1-NEXT: v_mov_b32_e32 v2, s2 +; GCN1-NEXT: v_mov_b32_e32 v3, s3 ; GCN1-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN1-NEXT: s_endpgm ; @@ -16057,22 +16139,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: s_mov_b32 s88, SCRATCH_RSRC_DWORD0 ; GCN2-NEXT: s_mov_b32 s89, SCRATCH_RSRC_DWORD1 ; GCN2-NEXT: s_mov_b32 s90, -1 +; GCN2-NEXT: s_load_dword s6, s[4:5], 0xfc +; GCN2-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GCN2-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GCN2-NEXT: s_mov_b32 s91, 0xe80000 ; GCN2-NEXT: s_add_u32 s88, s88, s11 -; GCN2-NEXT: s_load_dword s2, s[4:5], 0xfc -; GCN2-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GCN2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 ; GCN2-NEXT: s_addc_u32 s89, s89, 0 ; GCN2-NEXT: s_waitcnt lgkmcnt(0) -; GCN2-NEXT: s_cmp_eq_u32 s9, s2 -; GCN2-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GCN2-NEXT: s_andn2_b64 vcc, exec, s[2:3] +; GCN2-NEXT: s_cmp_eq_u32 s1, s6 +; GCN2-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN2-NEXT: s_andn2_b64 
vcc, exec, s[6:7] ; GCN2-NEXT: s_cbranch_vccz .LBB120_2 ; GCN2-NEXT: ; %bb.1: ; %atomicrmw.global -; GCN2-NEXT: v_mov_b32_e32 v0, s8 -; GCN2-NEXT: v_mov_b32_e32 v3, s1 -; GCN2-NEXT: v_mov_b32_e32 v1, s9 -; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: v_mov_b32_e32 v0, s0 +; GCN2-NEXT: v_mov_b32_e32 v2, s4 +; GCN2-NEXT: v_mov_b32_e32 v1, s1 +; GCN2-NEXT: v_mov_b32_e32 v3, s5 ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc ; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GCN2-NEXT: buffer_wbinvl1_vol @@ -16081,29 +16163,33 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GCN2-NEXT: .LBB120_2: ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: .LBB120_3: ; %atomicrmw.private -; GCN2-NEXT: s_cmp_lg_u64 s[8:9], 0 -; GCN2-NEXT: s_cselect_b32 s2, s8, -1 -; GCN2-NEXT: v_mov_b32_e32 v2, s2 -; GCN2-NEXT: s_add_i32 s2, s2, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s2 +; GCN2-NEXT: s_cmp_lg_u64 s[0:1], 0 +; GCN2-NEXT: s_cselect_b32 s0, s0, -1 +; GCN2-NEXT: v_mov_b32_e32 v2, s0 +; GCN2-NEXT: s_add_i32 s0, s0, 4 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s1 -; GCN2-NEXT: v_mov_b32_e32 v5, s0 +; GCN2-NEXT: v_mov_b32_e32 v5, s4 +; GCN2-NEXT: v_mov_b32_e32 v4, s5 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[0:1], v[0:1] -; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen ; GCN2-NEXT: buffer_store_dword v4, v3, s[88:91], 0 offen ; GCN2-NEXT: .LBB120_4: ; %atomicrmw.end -; GCN2-NEXT: v_mov_b32_e32 v2, s10 -; GCN2-NEXT: v_mov_b32_e32 v3, s11 +; GCN2-NEXT: v_mov_b32_e32 v2, s2 +; GCN2-NEXT: v_mov_b32_e32 v3, s3 ; GCN2-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN2-NEXT: s_endpgm ; @@ -16131,19 +16217,26 @@ define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB120_3: ; %atomicrmw.private ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s6, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] -; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, s1 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; 
GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v4, s4, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB120_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] @@ -16191,25 +16284,30 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN1-NEXT: s_cbranch_execnz .LBB121_2 ; GCN1-NEXT: .LBB121_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e64 s[4:5], s[0:1], 0 -; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: v_mov_b32_e32 v5, s2 ; GCN1-NEXT: s_and_b64 s[4:5], s[4:5], exec ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[12:15], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[12:15], 0 offen -; GCN1-NEXT: v_mov_b32_e32 v5, s2 +; GCN1-NEXT: v_mov_b32_e32 v4, s3 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] -; GCN1-NEXT: v_add_i32_e64 v0, s[2:3], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] -; GCN1-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[12:15], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[12:15], 0 offen +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[12:15], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[12:15], 0 offen ; GCN1-NEXT: s_endpgm ; ; GCN2-LABEL: atomic_dec_i64_decr64: @@ -16251,21 +16349,26 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen -; GCN2-NEXT: v_mov_b32_e32 v4, s3 ; GCN2-NEXT: v_mov_b32_e32 v5, s2 +; GCN2-NEXT: v_mov_b32_e32 v4, s3 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[2:3], v[0:1] -; GCN2-NEXT: v_add_u32_e64 v0, s[2:3], -1, v0 -; GCN2-NEXT: 
s_or_b64 vcc, vcc, s[0:1] -; GCN2-NEXT: v_addc_u32_e64 v1, s[2:3], -1, v1, s[2:3] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[88:91], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[88:91], 0 offen +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[88:91], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[88:91], 0 offen ; GCN2-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_dec_i64_decr64: @@ -16297,19 +16400,25 @@ define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) ; GFX12-NEXT: s_cbranch_execnz .LBB121_2 ; GFX12-NEXT: .LBB121_4: ; %atomicrmw.private ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s4, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s4 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[2:3], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], v[0:1] -; GFX12-NEXT: v_add_co_u32 v0, s1, v0, -1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, s1 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0 -; GFX12-NEXT: scratch_store_b64 off, v[0:1], s4 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v0, vcc_lo, v0, -1 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v1, null, -1, v1, vcc_lo +; GFX12-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v1, v1, s3, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[0:1], s0 ; GFX12-NEXT: s_endpgm entry: %ptr = getelementptr i64, ptr %out, i64 %index @@ -16355,17 +16464,21 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN1-NEXT: s_cselect_b32 s0, s0, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s0 ; GCN1-NEXT: s_add_i32 s0, s0, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v0, v2, s[16:19], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s0 ; GCN1-NEXT: buffer_load_dword v1, v3, s[16:19], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v4, s13 ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[2:3], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[0:1] -; GCN1-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 
v5, v6, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[16:19], 0 offen @@ -16411,18 +16524,22 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GCN2-NEXT: s_cselect_b32 s0, s0, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s0 ; GCN2-NEXT: s_add_i32 s0, s0, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v0, v2, s[88:91], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s0 ; GCN2-NEXT: buffer_load_dword v1, v3, s[88:91], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v5, s12 ; GCN2-NEXT: v_mov_b32_e32 v4, s13 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[2:3], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[0:1], s[12:13], v[0:1] -; GCN2-NEXT: v_addc_u32_e64 v7, s[2:3], -1, v1, s[2:3] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[88:91], 0 offen @@ -16458,19 +16575,26 @@ define amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %i ; GFX12-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GFX12-NEXT: .LBB122_3: ; %atomicrmw.private ; GFX12-NEXT: s_cmp_lg_u64 s[0:1], 0 -; GFX12-NEXT: s_cselect_b32 s6, s0, -1 -; GFX12-NEXT: scratch_load_b64 v[0:1], off, s6 +; GFX12-NEXT: s_cselect_b32 s0, s0, -1 +; GFX12-NEXT: scratch_load_b64 v[0:1], off, s0 ; GFX12-NEXT: s_wait_loadcnt 0x0 +; GFX12-NEXT: v_cmp_lt_u64_e32 vcc_lo, s[4:5], v[0:1] +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[0:1] -; GFX12-NEXT: v_cmp_lt_u64_e64 s0, s[4:5], v[0:1] -; GFX12-NEXT: v_add_co_u32 v2, s1, v0, -1 -; GFX12-NEXT: s_wait_alu 0xf1ff -; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, s1 -; GFX12-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX12-NEXT: s_wait_alu 0xfffe -; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, s0 -; GFX12-NEXT: v_cndmask_b32_e64 v2, v2, s4, s0 -; GFX12-NEXT: scratch_store_b64 off, v[2:3], s6 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX12-NEXT: v_add_co_u32 v4, vcc_lo, v0, -1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, -1, v1, vcc_lo +; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, v3, s5, vcc_lo +; GFX12-NEXT: v_cndmask_b32_e64 v2, v4, s4, vcc_lo +; GFX12-NEXT: scratch_store_b64 off, v[2:3], s0 ; GFX12-NEXT: .LBB122_4: ; %atomicrmw.end ; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll index 23dfe2f70fa7e..8e829adfd6cf5 100644 --- a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system.ll @@ -21820,10 
+21820,10 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB141_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB141_4 ; GCN1-NEXT: .LBB141_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -21831,25 +21831,30 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB141_2 ; GCN1-NEXT: .LBB141_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -21864,10 +21869,10 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB141_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB141_4 ; GCN2-NEXT: .LBB141_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -21875,25 +21880,30 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB141_2 ; GCN2-NEXT: .LBB141_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: 
v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -21906,10 +21916,10 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB141_3 ; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB141_4 ; GCN3-NEXT: .LBB141_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB141_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -21917,24 +21927,29 @@ define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB141_2 ; GCN3-NEXT: .LBB141_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN3-NEXT: v_and_b32_e32 v0, 1, 
v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst @@ -21955,10 +21970,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB142_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB142_4 ; GCN1-NEXT: .LBB142_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -21966,25 +21981,30 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB142_2 ; GCN1-NEXT: .LBB142_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22001,10 +22021,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB142_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB142_4 ; GCN2-NEXT: .LBB142_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] 
@@ -22012,25 +22032,30 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB142_2 ; GCN2-NEXT: .LBB142_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22045,10 +22070,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB142_3 ; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB142_4 ; GCN3-NEXT: .LBB142_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB142_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -22056,24 +22081,29 @@ define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB142_2 ; GCN3-NEXT: .LBB142_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 -; 
GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN3-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -22096,10 +22126,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB143_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB143_4 ; GCN1-NEXT: .LBB143_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB143_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -22107,26 +22137,30 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB143_2 ; GCN1-NEXT: .LBB143_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22144,10 +22178,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB143_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB143_4 ; GCN2-NEXT: 
.LBB143_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB143_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -22155,26 +22189,30 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB143_2 ; GCN2-NEXT: .LBB143_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22190,10 +22228,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB143_3 ; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB143_4 ; GCN3-NEXT: .LBB143_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB143_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -22201,7 +22239,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB143_2 ; GCN3-NEXT: .LBB143_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] @@ -22209,17 +22247,21 @@ define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; 
GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN3-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst @@ -22241,10 +22283,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB144_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB144_4 ; GCN1-NEXT: .LBB144_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB144_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -22252,26 +22294,30 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB144_2 ; GCN1-NEXT: .LBB144_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22289,10 +22335,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB144_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB144_4 ; GCN2-NEXT: .LBB144_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 
s[30:31] ; GCN2-NEXT: .LBB144_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -22300,26 +22346,30 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB144_2 ; GCN2-NEXT: .LBB144_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22335,10 +22385,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB144_3 ; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB144_4 ; GCN3-NEXT: .LBB144_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB144_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -22346,7 +22396,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB144_2 ; GCN3-NEXT: .LBB144_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] @@ -22354,17 +22404,21 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] -; 
GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN3-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -22405,20 +22459,25 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22452,21 +22511,26 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; 
GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22501,16 +22565,21 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN3-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst @@ -22552,20 +22621,25 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v4, s7 +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN1-NEXT: v_add_i32_e64 v0, s[36:37], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN1-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN1-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN1-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -22601,21 +22675,26 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 
s34, s34, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN2-NEXT: v_add_u32_e64 v0, s[36:37], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN2-NEXT: v_addc_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc -; GCN2-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v3, s[0:3], 0 offen +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v5, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v4, vcc +; GCN2-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v3, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -22652,16 +22731,21 @@ define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg ; GCN3-NEXT: buffer_load_dword v1, v2, s[0:3], 0 offen offset:4 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[36:37], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[36:37], -1, v1, s[36:37] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen offset:4 +; GCN3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN3-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -22699,17 +22783,21 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN1-NEXT: s_cselect_b32 s34, s4, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[36:37], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: 
v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN1-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen @@ -22745,18 +22833,22 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN2-NEXT: s_cselect_b32 s34, s4, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[36:37], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN2-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen @@ -22794,12 +22886,16 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 ; GCN3-NEXT: v_mov_b32_e32 v4, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[36:37], -1, v0 +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[36:37], -1, v1, s[36:37] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN3-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen @@ -22843,17 +22939,21 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN1-NEXT: s_cselect_b32 s34, s34, -1 ; GCN1-NEXT: v_mov_b32_e32 v2, s34 ; GCN1-NEXT: s_add_i32 s34, s34, 4 -; GCN1-NEXT: v_mov_b32_e32 v3, s34 ; GCN1-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN1-NEXT: v_mov_b32_e32 v3, s34 ; GCN1-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN1-NEXT: v_mov_b32_e32 v4, s7 ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[36:37], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: 
v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN1-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN1-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen @@ -22891,18 +22991,22 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN2-NEXT: s_cselect_b32 s34, s34, -1 ; GCN2-NEXT: v_mov_b32_e32 v2, s34 ; GCN2-NEXT: s_add_i32 s34, s34, 4 -; GCN2-NEXT: v_mov_b32_e32 v3, s34 ; GCN2-NEXT: buffer_load_dword v0, v2, s[0:3], 0 offen +; GCN2-NEXT: v_mov_b32_e32 v3, s34 ; GCN2-NEXT: buffer_load_dword v1, v3, s[0:3], 0 offen ; GCN2-NEXT: v_mov_b32_e32 v5, s6 ; GCN2-NEXT: v_mov_b32_e32 v4, s7 ; GCN2-NEXT: s_waitcnt vmcnt(1) -; GCN2-NEXT: v_add_u32_e64 v6, s[36:37], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN2-NEXT: v_addc_u32_e64 v7, s[36:37], -1, v1, s[36:37] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v4, v7, v4, vcc ; GCN2-NEXT: buffer_store_dword v5, v2, s[0:3], 0 offen @@ -22942,12 +23046,16 @@ define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %ou ; GCN3-NEXT: v_mov_b32_e32 v4, s6 ; GCN3-NEXT: v_mov_b32_e32 v3, s7 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[36:37], -1, v0 +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_lt_u64_e64 s[34:35], s[6:7], v[0:1] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[36:37], -1, v1, s[36:37] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[34:35] +; GCN3-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN3-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN3-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v4, v2, s[0:3], 0 offen @@ -22974,10 +23082,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN1-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB149_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB149_4 ; GCN1-NEXT: .LBB149_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB149_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -22985,25 +23093,30 @@ define void 
@flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB149_2 ; GCN1-NEXT: .LBB149_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN1-NEXT: s_waitcnt vmcnt(1) +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_add_i32_e64 v0, s[6:7], -1, v0 -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN1-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN1-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN1-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN1-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN1-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN1-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GCN1-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GCN1-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN1-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -23020,10 +23133,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB149_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB149_4 ; GCN2-NEXT: .LBB149_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB149_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -23031,25 +23144,30 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB149_2 ; GCN2-NEXT: .LBB149_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen +; GCN2-NEXT: s_waitcnt vmcnt(1) +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; 
GCN2-NEXT: v_add_u32_e64 v0, s[6:7], -1, v0 -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN2-NEXT: v_addc_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN2-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN2-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN2-NEXT: buffer_store_dword v1, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v0, v0, v8 +; GCN2-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN2-NEXT: v_cndmask_b32_e32 v1, v6, v2, vcc +; GCN2-NEXT: v_cndmask_b32_e32 v0, v7, v3, vcc +; GCN2-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN2-NEXT: buffer_store_dword v0, v5, s[0:3], 0 offen +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -23064,10 +23182,10 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB149_3 ; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB149_4 ; GCN3-NEXT: .LBB149_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB149_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] @@ -23075,24 +23193,29 @@ define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB149_2 ; GCN3-NEXT: .LBB149_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GCN3-NEXT: v_cndmask_b32_e32 v4, -1, v0, vcc ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_waitcnt vmcnt(1) +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_add_co_u32_e64 v0, s[6:7], -1, v0 -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] -; GCN3-NEXT: v_addc_co_u32_e64 v1, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc -; GCN3-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc -; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen -; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v0, v0, v7 +; GCN3-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN3-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc +; GCN3-NEXT: v_cndmask_b32_e32 v0, v6, v3, vcc +; GCN3-NEXT: buffer_store_dword v1, v4, s[0:3], 0 offen +; GCN3-NEXT: buffer_store_dword v0, v4, s[0:3], 0 offen offset:4 +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 @@ -23115,10 +23238,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-NEXT: s_xor_b64 s[4:5], exec, 
s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB150_3 ; GCN1-NEXT: ; %bb.1: ; %Flow -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execnz .LBB150_4 ; GCN1-NEXT: .LBB150_2: ; %atomicrmw.phi -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_setpc_b64 s[30:31] ; GCN1-NEXT: .LBB150_3: ; %atomicrmw.global ; GCN1-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -23126,26 +23249,30 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN1-NEXT: buffer_wbinvl1_vol ; GCN1-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN1-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN1-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN1-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN1-NEXT: s_cbranch_execz .LBB150_2 ; GCN1-NEXT: .LBB150_4: ; %atomicrmw.private ; GCN1-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN1-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN1-NEXT: v_add_i32_e32 v5, vcc, 4, v4 ; GCN1-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GCN1-NEXT: s_waitcnt vmcnt(1) -; GCN1-NEXT: v_add_i32_e64 v6, s[6:7], -1, v0 +; GCN1-NEXT: v_add_i32_e32 v6, vcc, -1, v0 ; GCN1-NEXT: s_waitcnt vmcnt(0) +; GCN1-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN1-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN1-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN1-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN1-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN1-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN1-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN1-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN1-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN1-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN1-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN1-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GCN1-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN1-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN1-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN1-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN1-NEXT: s_waitcnt vmcnt(0) ; GCN1-NEXT: s_setpc_b64 s[30:31] ; @@ -23163,10 +23290,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN2-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB150_3 ; GCN2-NEXT: ; %bb.1: ; %Flow -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execnz .LBB150_4 ; GCN2-NEXT: .LBB150_2: ; %atomicrmw.phi -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_setpc_b64 s[30:31] ; GCN2-NEXT: .LBB150_3: ; %atomicrmw.global ; GCN2-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -23174,26 +23301,30 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN2-NEXT: buffer_wbinvl1_vol ; GCN2-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN2-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN2-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN2-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN2-NEXT: s_cbranch_execz .LBB150_2 ; GCN2-NEXT: .LBB150_4: ; %atomicrmw.private ; GCN2-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] ; GCN2-NEXT: v_cndmask_b32_e32 v4, -1, v4, vcc -; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen +; GCN2-NEXT: v_add_u32_e32 v5, vcc, 4, v4 ; GCN2-NEXT: buffer_load_dword v1, v5, s[0:3], 0 offen ; GCN2-NEXT: s_waitcnt vmcnt(1) -; 
GCN2-NEXT: v_add_u32_e64 v6, s[6:7], -1, v0 +; GCN2-NEXT: v_add_u32_e32 v6, vcc, -1, v0 ; GCN2-NEXT: s_waitcnt vmcnt(0) +; GCN2-NEXT: v_addc_u32_e32 v7, vcc, -1, v1, vcc +; GCN2-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN2-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc ; GCN2-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN2-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN2-NEXT: v_addc_u32_e64 v7, s[6:7], -1, v1, s[6:7] -; GCN2-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN2-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN2-NEXT: v_or_b32_e32 v8, v9, v8 +; GCN2-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; GCN2-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc ; GCN2-NEXT: v_cndmask_b32_e32 v3, v7, v3, vcc ; GCN2-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN2-NEXT: buffer_store_dword v3, v5, s[0:3], 0 offen -; GCN2-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN2-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN2-NEXT: s_waitcnt vmcnt(0) ; GCN2-NEXT: s_setpc_b64 s[30:31] ; @@ -23209,10 +23340,10 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB150_3 ; GCN3-NEXT: ; %bb.1: ; %Flow -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execnz .LBB150_4 ; GCN3-NEXT: .LBB150_2: ; %atomicrmw.phi -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_setpc_b64 s[30:31] ; GCN3-NEXT: .LBB150_3: ; %atomicrmw.global ; GCN3-NEXT: flat_atomic_dec_x2 v[0:1], v[4:5], v[2:3] glc @@ -23220,7 +23351,7 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: buffer_wbinvl1_vol ; GCN3-NEXT: ; implicit-def: $vgpr4_vgpr5 ; GCN3-NEXT: ; implicit-def: $vgpr2_vgpr3 -; GCN3-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] +; GCN3-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] ; GCN3-NEXT: s_cbranch_execz .LBB150_2 ; GCN3-NEXT: .LBB150_4: ; %atomicrmw.private ; GCN3-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] @@ -23228,17 +23359,21 @@ define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %o ; GCN3-NEXT: buffer_load_dword v0, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_load_dword v1, v4, s[0:3], 0 offen offset:4 ; GCN3-NEXT: s_waitcnt vmcnt(1) -; GCN3-NEXT: v_add_co_u32_e64 v5, s[6:7], -1, v0 +; GCN3-NEXT: v_add_co_u32_e32 v5, vcc, -1, v0 ; GCN3-NEXT: s_waitcnt vmcnt(0) +; GCN3-NEXT: v_addc_co_u32_e32 v6, vcc, -1, v1, vcc +; GCN3-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3] +; GCN3-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GCN3-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN3-NEXT: v_cmp_gt_u64_e64 s[4:5], v[0:1], v[2:3] -; GCN3-NEXT: v_addc_co_u32_e64 v6, s[6:7], -1, v1, s[6:7] -; GCN3-NEXT: s_or_b64 vcc, vcc, s[4:5] +; GCN3-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GCN3-NEXT: v_or_b32_e32 v7, v8, v7 +; GCN3-NEXT: v_and_b32_e32 v7, 1, v7 +; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 ; GCN3-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc ; GCN3-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc ; GCN3-NEXT: buffer_store_dword v2, v4, s[0:3], 0 offen ; GCN3-NEXT: buffer_store_dword v3, v4, s[0:3], 0 offen offset:4 -; GCN3-NEXT: s_or_b64 exec, exec, s[8:9] +; GCN3-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN3-NEXT: s_waitcnt vmcnt(0) ; GCN3-NEXT: s_setpc_b64 s[30:31] %gep = getelementptr i64, ptr %out, i64 4 diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll index f199db3ca12ca..9b23d4f377772 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll @@ -4030,7 +4030,7 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-NEXT: v_mov_b32_e32 v2, 0x8000 +; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4045,44 +4045,46 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 { ; GFX11-NEXT: v_bfe_u32 v3, v1, 20, 11 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v1 ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3 +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fpround = fptrunc double %a to half %fneg = fneg half %fpround @@ -4180,7 +4182,7 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-NEXT: v_mov_b32_e32 v2, 0x8000 +; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: s_setpc_b64 s[30:31] @@ -4195,43 +4197,44 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; GFX11-NEXT: v_sub_nc_u32_e32 v4, 0x3f1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v1, 0xffff8000, v1 ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v0, 0xffe, v2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_med3_i32 v2, v4, 0, 13 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, 0x1000, v0 -; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_lshrrev_b32_e32 v5, v2, v4 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, v2, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, v2, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v2, v5, v2 ; GFX11-NEXT: v_add_nc_u32_e32 v3, 0xfffffc10, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_lshl_or_b32 v4, v3, 12, v0 ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_and_b32_e32 v4, 7, v2 ; GFX11-NEXT: v_lshrrev_b32_e32 v2, 2, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_lt_i32_e32 vcc_lo, 5, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v4 ; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_or_b32_e32 v4, v4, v5 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v5, 0x7e00 :: v_dual_add_nc_u32 v2, v2, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v5, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 31, v3 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg.a = fneg double %a %fpround = fptrunc double %fneg.a to half @@ -4336,7 +4339,7 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; VI-NEXT: v_cndmask_b32_e32 v0, v6, v7, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v5 ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; VI-NEXT: v_mov_b32_e32 v4, 0x8000 +; VI-NEXT: v_mov_b32_e32 v4, 0xffff8000 ; VI-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 ; VI-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 @@ -4387,9 +4390,9 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v3, v2 :: v_dual_and_b32 v5, 0xffff8000, v5 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_and_or_b32 v3, 0x8000, v5, v2 +; GFX11-NEXT: v_or_b32_e32 v3, v5, v2 ; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 ; GFX11-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -4599,7 +4602,7 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; VI-NEXT: v_cndmask_b32_e32 v4, v7, v8, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v6 ; VI-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc -; VI-NEXT: v_mov_b32_e32 v4, 0x8000 +; VI-NEXT: v_mov_b32_e32 v4, 0xffff8000 ; VI-NEXT: v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v0, v1, v0 ; VI-NEXT: v_mov_b32_e32 v1, v2 @@ -4651,8 +4654,8 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do ; GFX11-NEXT: v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo -; GFX11-NEXT: v_and_or_b32 v0, 0x8000, v1, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v0, v4 :: v_dual_and_b32 v1, 0xffff8000, v1 +; GFX11-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11-NEXT: v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %fneg.a = fneg double %a @@ -4757,7 +4760,7 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; VI-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s4, v3 ; VI-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; VI-NEXT: v_mov_b32_e32 v2, 0x8000 +; VI-NEXT: v_mov_b32_e32 v2, 0xffff8000 ; VI-NEXT: v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; VI-NEXT: v_or_b32_e32 v1, v1, v0 ; VI-NEXT: v_xor_b32_e32 v0, 0x8000, v1 @@ -4807,9 +4810,9 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 { ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; 
GFX11-NEXT: v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3 -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v0 :: v_dual_and_b32 v1, 0xffff8000, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_and_or_b32 v1, 0x8000, v1, v0 +; GFX11-NEXT: v_or_b32_e32 v1, v1, v0 ; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] ; GFX11-SAFE-TRUE16-LABEL: v_fneg_multi_use_fp_round_fneg_f64_to_f16: diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll index 498df8a65feda..c763865f1e9b1 100644 --- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll @@ -138,25 +138,33 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; SI-LABEL: test_isfinite_pattern_0: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_load_dword s4, s[4:5], 0xb +; SI-NEXT: s_load_dword s6, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s6|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s6, s6 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s6|, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s6, s6 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_and_b32_e32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v2, 1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -166,12 +174,16 @@ define amdgpu_kernel void @test_isfinite_pattern_0(ptr addrspace(1) nocapture %o ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f32_e64 s2, s2, 0x1f8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s2| +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -280,29 +292,33 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr 
addrspace(1) nocaptur ; SI-LABEL: test_isfinite_not_pattern_1: ; SI: ; %bb.0: ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dword s4, s[4:5], 0xb ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s6, s6 -; SI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cmp_neq_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_cmp_o_f32_e64 s[4:5], s4, s4 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_not_pattern_1: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s6, s[4:5], 0x2c +; VI-NEXT: s_load_dword s2, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s6, s6 -; VI-NEXT: v_cmp_neq_f32_e32 vcc, s6, v0 -; VI-NEXT: s_and_b64 s[2:3], s[2:3], vcc +; VI-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_and_b32_e32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v2, 1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -312,14 +328,16 @@ define amdgpu_kernel void @test_isfinite_not_pattern_1(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s2 -; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, s2 -; GFX11-NEXT: s_and_b32 s2, s3, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, s2 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %ninf = fcmp une float %x, 0x7FF0000000000000 @@ -340,10 +358,12 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 -; SI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: v_cmp_neq_f32_e64 s[0:1], |s3|, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_cmp_o_f32_e64 s[0:1], s2, s2 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_and_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -352,11 +372,13 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; VI-NEXT: s_load_dwordx4 
s[0:3], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_o_f32_e64 s[4:5], s2, s2 -; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s3|, v0 -; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s3|, v0 +; VI-NEXT: v_cmp_o_f32_e64 s[2:3], s2, s2 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_and_b32_e32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v2, 1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -364,14 +386,16 @@ define amdgpu_kernel void @test_isfinite_not_pattern_2(ptr addrspace(1) nocaptur ; GFX11-LABEL: test_isfinite_not_pattern_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 ; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s3| -; GFX11-NEXT: s_and_b32 s2, s2, s3 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %y) #1 @@ -392,10 +416,12 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cmp_u_f32_e64 s[4:5], s6, s6 -; SI-NEXT: v_cmp_neq_f32_e64 s[6:7], |s6|, v0 -; SI-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s6|, v0 ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cmp_u_f32_e64 s[4:5], s6, s6 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -405,11 +431,13 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x7f800000 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_cmp_neq_f32_e64 s[2:3], |s6|, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; VI-NEXT: v_cmp_u_f32_e64 s[2:3], s6, s6 -; VI-NEXT: v_cmp_neq_f32_e64 s[4:5], |s6|, v0 -; VI-NEXT: s_and_b64 s[2:3], s[2:3], s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_and_b32_e32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v2, 1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -419,14 +447,16 @@ define amdgpu_kernel void @test_isfinite_not_pattern_3(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_u_f32_e64 s3, s2, s2 -; GFX11-NEXT: v_cmp_neq_f32_e64 s2, 0x7f800000, |s2| -; GFX11-NEXT: s_and_b32 s2, 
s3, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_neq_f32_e64 s3, 0x7f800000, |s2| +; GFX11-NEXT: v_cmp_u_f32_e64 s2, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %ord = fcmp uno float %x, 0.000000e+00 %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -536,35 +566,39 @@ define amdgpu_kernel void @test_isfinite_pattern_4_commute_and(ptr addrspace(1) define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrspace(1) nocapture %out, float %x, [8 x i32], float %y) #0 { ; SI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; SI: ; %bb.0: -; SI-NEXT: s_load_dword s0, s[4:5], 0x14 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 -; SI-NEXT: s_load_dword s1, s[4:5], 0xb -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 +; SI-NEXT: s_load_dword s6, s[4:5], 0x14 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v1, s0 -; SI-NEXT: v_cmp_o_f32_e32 vcc, s1, v1 -; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s1, v0 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; SI-NEXT: v_mov_b32_e32 v1, s6 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_cmp_o_f32_e32 vcc, s4, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_not_isfinite_pattern_4_wrong_ord_test: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s0, s[4:5], 0x50 -; VI-NEXT: s_load_dword s1, s[4:5], 0x2c -; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; VI-NEXT: s_load_dword s2, s[4:5], 0x50 +; VI-NEXT: s_load_dword s3, s[4:5], 0x2c +; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s0 -; VI-NEXT: v_cmp_class_f32_e32 vcc, s1, v0 -; VI-NEXT: v_cmp_o_f32_e64 s[0:1], s1, v1 -; VI-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v1, s2 +; VI-NEXT: v_cmp_class_f32_e32 vcc, s3, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-NEXT: v_cmp_o_f32_e32 vcc, s3, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_and_b32_e32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v2, 1, v0 +; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -574,14 +608,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x50 ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_o_f32_e64 s3, s2, s3 -; GFX11-NEXT: 
v_cmp_class_f32_e64 s2, s2, 0x1f8 -; GFX11-NEXT: s_and_b32 s2, s3, s2 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-NEXT: v_cmp_class_f32_e64 s4, s2, 0x1f8 +; GFX11-NEXT: v_cmp_o_f32_e64 s2, s2, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %ord = fcmp ord float %x, %y %x.fabs = tail call float @llvm.fabs.f32(float %x) #1 @@ -649,23 +685,29 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cvt_f32_f16_e32 v0, s6 ; SI-NEXT: s_and_b32 s4, s6, 0x7fff -; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 ; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00 ; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 -; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: test_isfinite_pattern_0_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dword s2, s[4:5], 0x2c +; VI-NEXT: s_load_dword s6, s[4:5], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 -; VI-NEXT: v_mov_b32_e32 v0, 0x1f8 +; VI-NEXT: v_mov_b32_e32 v0, 0x7c00 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cmp_class_f16_e32 vcc, s2, v0 +; VI-NEXT: v_cmp_neq_f16_e64 s[2:3], |s6|, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; VI-NEXT: v_cmp_o_f16_e64 s[2:3], s6, s6 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_and_b32_e32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v2, 1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm @@ -675,12 +717,16 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[4:5], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_class_f16_e64 s2, s2, 0x1f8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cmp_neq_f16_e64 s3, 0x7c00, |s2| +; GFX11-NEXT: v_cmp_o_f16_e64 s2, s2, s2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 ; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 -; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-NEXT: s_endpgm %ord = fcmp ord half %x, 0.0 %x.fabs = tail call half @llvm.fabs.f16(half %x) #1 diff --git a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll index c685ad75b47f7..51e7e5836ebd3 100644 --- a/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/fp_trunc_store_fp64_to_bf16.ll @@ -7,14 +7,15 @@ define void @scalar(double %num, ptr addrspace(1) %p) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; CHECK-NEXT: v_and_b32_e32 v7, 1, v6 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v4, v6, v4 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, |v[4:5]| +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v4, v6, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v4, 1, v4 +; CHECK-NEXT: v_add_u32_e32 v7, v6, v7 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; CHECK-NEXT: v_bfe_u32 v5, v4, 16, 1 ; CHECK-NEXT: s_movk_i32 s4, 0x7fff ; CHECK-NEXT: v_add3_u32 v5, v5, v4, s4 @@ -36,32 +37,34 @@ define void @v2(<2 x double> %num, ptr addrspace(1) %p) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_f64_e32 v8, v[0:1] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[6:7], v8 -; CHECK-NEXT: v_and_b32_e32 v9, 1, v8 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[6:7]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[6:7] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v9 -; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v6, v8, v6 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v6, v6, v8, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, |v[6:7]| +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v6, v8, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v9, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v6, 1, v6 +; CHECK-NEXT: v_add_u32_e32 v9, v8, v9 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc ; CHECK-NEXT: v_bfe_u32 v7, v6, 16, 1 -; CHECK-NEXT: s_movk_i32 s8, 0x7fff -; CHECK-NEXT: v_add3_u32 v7, v7, v6, s8 +; CHECK-NEXT: s_movk_i32 s6, 0x7fff +; CHECK-NEXT: v_add3_u32 v7, v7, v6, s6 ; CHECK-NEXT: v_or_b32_e32 v6, 0x400000, v6 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; CHECK-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v7, v[2:3] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 -; CHECK-NEXT: v_and_b32_e32 v8, 1, v7 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v0, v7, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[2:3]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v7, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v8, v7, v8 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v7, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -82,32 +85,34 @@ 
define void @v3(<3 x double> %num, ptr addrspace(1) %p) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_f64_e32 v10, v[0:1] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 -; CHECK-NEXT: v_and_b32_e32 v11, 1, v10 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[8:9]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[8:9] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v8, v10, v8 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, |v[8:9]| +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v8, v10, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v11, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v8, 1, v8 +; CHECK-NEXT: v_add_u32_e32 v11, v10, v11 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc ; CHECK-NEXT: v_bfe_u32 v9, v8, 16, 1 -; CHECK-NEXT: s_movk_i32 s8, 0x7fff -; CHECK-NEXT: v_add3_u32 v9, v9, v8, s8 +; CHECK-NEXT: s_movk_i32 s6, 0x7fff +; CHECK-NEXT: v_add3_u32 v9, v9, v8, s6 ; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v9, v[2:3] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 -; CHECK-NEXT: v_and_b32_e32 v10, 1, v9 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v0, v9, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v9, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[2:3]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v9, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v10, v9, v10 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v10, v9, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -115,16 +120,17 @@ define void @v3(<3 x double> %num, ptr addrspace(1) %p) { ; CHECK-NEXT: v_cvt_f32_f64_e32 v3, v[4:5] ; CHECK-NEXT: v_perm_b32 v2, v0, v8, s4 ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; CHECK-NEXT: v_and_b32_e32 v8, 1, v3 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[4:5], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v0, v3, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[4:5]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v3, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v8, v3, v8 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, 
v[4:5], v[4:5] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc @@ -144,68 +150,72 @@ define void @v4(<4 x double> %num, ptr addrspace(1) %p) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_f64_e32 v12, v[4:5] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[10:11], v12 -; CHECK-NEXT: v_and_b32_e32 v13, 1, v12 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, |v[10:11]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[4:5], v[10:11] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v13 -; CHECK-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v10, v12, v10 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[4:5]|, |v[10:11]| +; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v10, v12, 1, v10 +; CHECK-NEXT: v_cndmask_b32_e64 v13, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v10, 1, v10 +; CHECK-NEXT: v_add_u32_e32 v13, v12, v13 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; CHECK-NEXT: v_cndmask_b32_e32 v10, v13, v12, vcc ; CHECK-NEXT: v_bfe_u32 v11, v10, 16, 1 -; CHECK-NEXT: s_movk_i32 s8, 0x7fff -; CHECK-NEXT: v_add3_u32 v11, v11, v10, s8 +; CHECK-NEXT: s_movk_i32 s6, 0x7fff +; CHECK-NEXT: v_add3_u32 v11, v11, v10, s6 ; CHECK-NEXT: v_or_b32_e32 v10, 0x400000, v10 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5] ; CHECK-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v11, v[6:7] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v11 -; CHECK-NEXT: v_and_b32_e32 v12, 1, v11 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, |v[4:5]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[6:7], v[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v12 -; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v4, v11, v4 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v11, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[6:7]|, |v[4:5]| +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v4, v11, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v12, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v4, 1, v4 +; CHECK-NEXT: v_add_u32_e32 v12, v11, v12 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v12, v11, vcc ; CHECK-NEXT: v_bfe_u32 v5, v4, 16, 1 -; CHECK-NEXT: v_add3_u32 v5, v5, v4, s8 +; CHECK-NEXT: v_add3_u32 v5, v5, v4, s6 ; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7] ; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; CHECK-NEXT: s_mov_b32 s9, 0x7060302 -; CHECK-NEXT: v_perm_b32 v5, v4, v10, s9 +; CHECK-NEXT: s_mov_b32 s7, 0x7060302 +; CHECK-NEXT: v_perm_b32 v5, v4, v10, s7 ; CHECK-NEXT: v_cvt_f32_f64_e32 v4, v[0:1] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; CHECK-NEXT: v_and_b32_e32 v10, 1, v4 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[6:7]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[6:7] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v6, v4, v6 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, |v[6:7]| +; CHECK-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v6, v4, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v6, 1, v6 +; CHECK-NEXT: v_add_u32_e32 v10, v4, v10 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v10, v4, vcc ; CHECK-NEXT: v_bfe_u32 v6, v4, 
16, 1 -; CHECK-NEXT: v_add3_u32 v6, v6, v4, s8 +; CHECK-NEXT: v_add3_u32 v6, v6, v4, s6 ; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; CHECK-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v6, v[2:3] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v6 -; CHECK-NEXT: v_and_b32_e32 v7, 1, v6 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v0, v6, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[2:3]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v6, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v7, v6, v7 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v6, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CHECK-NEXT: v_perm_b32 v4, v0, v4, s9 +; CHECK-NEXT: v_perm_b32 v4, v0, v4, s7 ; CHECK-NEXT: global_store_dwordx2 v[8:9], v[4:5], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -221,130 +231,138 @@ define void @v8(<8 x double> %num, ptr addrspace(1) %p) { ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: v_cvt_f32_f64_e32 v20, v[12:13] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[18:19], v20 -; CHECK-NEXT: v_and_b32_e32 v21, 1, v20 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[12:13]|, |v[18:19]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[12:13], v[18:19] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v21 -; CHECK-NEXT: v_cndmask_b32_e64 v18, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v18, v20, v18 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v18, v18, v20, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[12:13]|, |v[18:19]| +; CHECK-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v18, v20, 1, v18 +; CHECK-NEXT: v_cndmask_b32_e64 v21, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v18, 1, v18 +; CHECK-NEXT: v_add_u32_e32 v21, v20, v21 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; CHECK-NEXT: v_cndmask_b32_e32 v18, v21, v20, vcc ; CHECK-NEXT: v_bfe_u32 v19, v18, 16, 1 -; CHECK-NEXT: s_movk_i32 s8, 0x7fff -; CHECK-NEXT: v_add3_u32 v19, v19, v18, s8 +; CHECK-NEXT: s_movk_i32 s6, 0x7fff +; CHECK-NEXT: v_add3_u32 v19, v19, v18, s6 ; CHECK-NEXT: v_or_b32_e32 v18, 0x400000, v18 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[12:13] ; CHECK-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v19, v[14:15] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[12:13], v19 -; CHECK-NEXT: v_and_b32_e32 v20, 1, v19 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[14:15]|, |v[12:13]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[14:15], v[12:13] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20 -; CHECK-NEXT: v_cndmask_b32_e64 v12, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v12, v19, v12 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v19, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[14:15]|, |v[12:13]| +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v12, v19, 1, v12 +; CHECK-NEXT: 
v_cndmask_b32_e64 v20, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v12, 1, v12 +; CHECK-NEXT: v_add_u32_e32 v20, v19, v20 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; CHECK-NEXT: v_cndmask_b32_e32 v12, v20, v19, vcc ; CHECK-NEXT: v_bfe_u32 v13, v12, 16, 1 -; CHECK-NEXT: v_add3_u32 v13, v13, v12, s8 +; CHECK-NEXT: v_add3_u32 v13, v13, v12, s6 ; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[14:15] ; CHECK-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; CHECK-NEXT: s_mov_b32 s9, 0x7060302 -; CHECK-NEXT: v_perm_b32 v13, v12, v18, s9 +; CHECK-NEXT: s_mov_b32 s7, 0x7060302 +; CHECK-NEXT: v_perm_b32 v13, v12, v18, s7 ; CHECK-NEXT: v_cvt_f32_f64_e32 v12, v[8:9] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 -; CHECK-NEXT: v_and_b32_e32 v18, 1, v12 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[8:9]|, |v[14:15]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[8:9], v[14:15] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18 -; CHECK-NEXT: v_cndmask_b32_e64 v14, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v14, v12, v14 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[8:9]|, |v[14:15]| +; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v14, v12, 1, v14 +; CHECK-NEXT: v_cndmask_b32_e64 v18, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v14, 1, v14 +; CHECK-NEXT: v_add_u32_e32 v18, v12, v18 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; CHECK-NEXT: v_cndmask_b32_e32 v12, v18, v12, vcc ; CHECK-NEXT: v_bfe_u32 v14, v12, 16, 1 -; CHECK-NEXT: v_add3_u32 v14, v14, v12, s8 +; CHECK-NEXT: v_add3_u32 v14, v14, v12, s6 ; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[8:9] ; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v14, v[10:11] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 -; CHECK-NEXT: v_and_b32_e32 v15, 1, v14 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[10:11]|, |v[8:9]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[10:11], v[8:9] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15 -; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v8, v14, v8 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[10:11]|, |v[8:9]| +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v8, v14, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v15, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v8, 1, v8 +; CHECK-NEXT: v_add_u32_e32 v15, v14, v15 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v8, v15, v14, vcc ; CHECK-NEXT: v_bfe_u32 v9, v8, 16, 1 -; CHECK-NEXT: v_add3_u32 v9, v9, v8, s8 +; CHECK-NEXT: v_add3_u32 v9, v9, v8, s6 ; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[10:11] ; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v10, v[4:5] -; CHECK-NEXT: v_perm_b32 v12, v8, v12, s9 +; CHECK-NEXT: v_perm_b32 v12, v8, v12, s7 ; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 -; CHECK-NEXT: v_and_b32_e32 v11, 1, v10 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, |v[8:9]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[4:5], v[8:9] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v8, v10, v8 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[4:5]|, |v[8:9]| +; CHECK-NEXT: 
v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v8, v10, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v11, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v8, 1, v8 +; CHECK-NEXT: v_add_u32_e32 v11, v10, v11 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc ; CHECK-NEXT: v_bfe_u32 v9, v8, 16, 1 -; CHECK-NEXT: v_add3_u32 v9, v9, v8, s8 +; CHECK-NEXT: v_add3_u32 v9, v9, v8, s6 ; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], v[4:5] ; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v9, v[6:7] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v9 -; CHECK-NEXT: v_and_b32_e32 v10, 1, v9 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, |v[4:5]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[6:7], v[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v4, v9, v4 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[6:7]|, |v[4:5]| +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v4, v9, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v4, 1, v4 +; CHECK-NEXT: v_add_u32_e32 v10, v9, v10 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v10, v9, vcc ; CHECK-NEXT: v_bfe_u32 v5, v4, 16, 1 -; CHECK-NEXT: v_add3_u32 v5, v5, v4, s8 +; CHECK-NEXT: v_add3_u32 v5, v5, v4, s6 ; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7] ; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] -; CHECK-NEXT: v_perm_b32 v11, v4, v8, s9 +; CHECK-NEXT: v_perm_b32 v11, v4, v8, s7 ; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; CHECK-NEXT: v_and_b32_e32 v7, 1, v6 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v4, v6, v4 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[0:1]|, |v[4:5]| +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v4, v6, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v4, 1, v4 +; CHECK-NEXT: v_add_u32_e32 v7, v6, v7 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; CHECK-NEXT: v_bfe_u32 v5, v4, 16, 1 -; CHECK-NEXT: v_add3_u32 v5, v5, v4, s8 +; CHECK-NEXT: v_add3_u32 v5, v5, v4, s6 ; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v5, v[2:3] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5 -; CHECK-NEXT: v_and_b32_e32 v6, 1, v5 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v0, v5, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[2:3]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v5, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; 
CHECK-NEXT: v_add_u32_e32 v6, v5, v6 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s8 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CHECK-NEXT: v_perm_b32 v10, v0, v4, s9 +; CHECK-NEXT: v_perm_b32 v10, v0, v4, s7 ; CHECK-NEXT: global_store_dwordx4 v[16:17], v[10:13], off ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] @@ -358,260 +376,276 @@ define void @v16(<16 x double> %num, ptr addrspace(1) %p) { ; CHECK-LABEL: v16: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 -; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 -; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CHECK-NEXT: v_cvt_f32_f64_e32 v36, v[12:13] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[34:35], v36 -; CHECK-NEXT: v_and_b32_e32 v37, 1, v36 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[12:13]|, |v[34:35]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[12:13], v[34:35] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v37 -; CHECK-NEXT: v_cndmask_b32_e64 v34, -1, 1, s[6:7] -; CHECK-NEXT: v_add_u32_e32 v34, v36, v34 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v34, v34, v36, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[4:5], |v[12:13]|, |v[34:35]| +; CHECK-NEXT: v_cndmask_b32_e64 v34, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v34, v36, 1, v34 +; CHECK-NEXT: v_cndmask_b32_e64 v37, -1, 1, s[4:5] +; CHECK-NEXT: v_and_b32_e32 v34, 1, v34 +; CHECK-NEXT: v_add_u32_e32 v37, v36, v37 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v34 +; CHECK-NEXT: v_cndmask_b32_e32 v34, v37, v36, vcc ; CHECK-NEXT: v_bfe_u32 v35, v34, 16, 1 -; CHECK-NEXT: s_movk_i32 s6, 0x7fff -; CHECK-NEXT: v_add3_u32 v35, v35, v34, s6 +; CHECK-NEXT: s_movk_i32 s4, 0x7fff +; CHECK-NEXT: v_add3_u32 v35, v35, v34, s4 ; CHECK-NEXT: v_or_b32_e32 v34, 0x400000, v34 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[12:13], v[12:13] +; CHECK-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8 +; CHECK-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4 +; CHECK-NEXT: buffer_load_dword v31, off, s[0:3], s32 ; CHECK-NEXT: v_cndmask_b32_e32 v34, v35, v34, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v35, v[14:15] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[12:13], v35 -; CHECK-NEXT: v_and_b32_e32 v36, 1, v35 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[14:15]|, |v[12:13]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[14:15], v[12:13] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v36 -; CHECK-NEXT: v_cndmask_b32_e64 v12, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v12, v35, v12 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v12, v12, v35, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[14:15]|, |v[12:13]| +; CHECK-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v12, v35, 1, v12 +; CHECK-NEXT: v_cndmask_b32_e64 v36, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v12, 1, v12 +; CHECK-NEXT: v_add_u32_e32 v36, v35, v36 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; CHECK-NEXT: v_cndmask_b32_e32 v12, v36, v35, vcc ; CHECK-NEXT: v_bfe_u32 v13, v12, 16, 1 -; CHECK-NEXT: v_add3_u32 v13, v13, v12, s6 +; CHECK-NEXT: v_add3_u32 v13, v13, v12, s4 ; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[14:15], v[14:15] ; CHECK-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc -; CHECK-NEXT: 
s_mov_b32 s7, 0x7060302 -; CHECK-NEXT: v_perm_b32 v13, v12, v34, s7 +; CHECK-NEXT: s_mov_b32 s5, 0x7060302 +; CHECK-NEXT: v_perm_b32 v13, v12, v34, s5 ; CHECK-NEXT: v_cvt_f32_f64_e32 v12, v[8:9] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[14:15], v12 -; CHECK-NEXT: v_and_b32_e32 v34, 1, v12 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[8:9]|, |v[14:15]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[8:9], v[14:15] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v34 -; CHECK-NEXT: v_cndmask_b32_e64 v14, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v14, v12, v14 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[8:9]|, |v[14:15]| +; CHECK-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v14, v12, 1, v14 +; CHECK-NEXT: v_cndmask_b32_e64 v34, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v14, 1, v14 +; CHECK-NEXT: v_add_u32_e32 v34, v12, v34 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; CHECK-NEXT: v_cndmask_b32_e32 v12, v34, v12, vcc ; CHECK-NEXT: v_bfe_u32 v14, v12, 16, 1 -; CHECK-NEXT: v_add3_u32 v14, v14, v12, s6 +; CHECK-NEXT: v_add3_u32 v14, v14, v12, s4 ; CHECK-NEXT: v_or_b32_e32 v12, 0x400000, v12 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[8:9], v[8:9] ; CHECK-NEXT: v_cndmask_b32_e32 v12, v14, v12, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v14, v[10:11] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v14 -; CHECK-NEXT: v_and_b32_e32 v15, 1, v14 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[10:11]|, |v[8:9]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[10:11], v[8:9] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v15 -; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v8, v14, v8 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v14, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[10:11]|, |v[8:9]| +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v8, v14, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v15, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v8, 1, v8 +; CHECK-NEXT: v_add_u32_e32 v15, v14, v15 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v8, v15, v14, vcc ; CHECK-NEXT: v_bfe_u32 v9, v8, 16, 1 -; CHECK-NEXT: v_add3_u32 v9, v9, v8, s6 +; CHECK-NEXT: v_add3_u32 v9, v9, v8, s4 ; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[10:11], v[10:11] ; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v10, v[4:5] -; CHECK-NEXT: v_perm_b32 v12, v8, v12, s7 +; CHECK-NEXT: v_perm_b32 v12, v8, v12, s5 ; CHECK-NEXT: v_cvt_f64_f32_e32 v[8:9], v10 -; CHECK-NEXT: v_and_b32_e32 v11, 1, v10 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[4:5]|, |v[8:9]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[4:5], v[8:9] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v11 -; CHECK-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v8, v10, v8 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[4:5]|, |v[8:9]| +; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v8, v10, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e64 v11, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v8, 1, v8 +; CHECK-NEXT: v_add_u32_e32 v11, v10, v11 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; CHECK-NEXT: v_cndmask_b32_e32 v8, v11, v10, vcc ; CHECK-NEXT: v_bfe_u32 v9, v8, 16, 1 -; CHECK-NEXT: v_add3_u32 v9, v9, v8, s6 +; CHECK-NEXT: v_add3_u32 v9, v9, v8, s4 ; CHECK-NEXT: v_or_b32_e32 v8, 0x400000, v8 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[4:5], 
v[4:5] ; CHECK-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v9, v[6:7] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v9 -; CHECK-NEXT: v_and_b32_e32 v10, 1, v9 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[6:7]|, |v[4:5]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[6:7], v[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10 -; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v4, v9, v4 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[6:7]|, |v[4:5]| +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v4, v9, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v4, 1, v4 +; CHECK-NEXT: v_add_u32_e32 v10, v9, v10 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v10, v9, vcc ; CHECK-NEXT: v_bfe_u32 v5, v4, 16, 1 -; CHECK-NEXT: v_add3_u32 v5, v5, v4, s6 +; CHECK-NEXT: v_add3_u32 v5, v5, v4, s4 ; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[6:7], v[6:7] ; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v6, v[0:1] -; CHECK-NEXT: v_perm_b32 v11, v4, v8, s7 +; CHECK-NEXT: v_perm_b32 v11, v4, v8, s5 ; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; CHECK-NEXT: v_and_b32_e32 v7, 1, v6 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[0:1]|, |v[4:5]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v4, v6, v4 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[0:1]|, |v[4:5]| +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v4, v6, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v4, 1, v4 +; CHECK-NEXT: v_add_u32_e32 v7, v6, v7 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; CHECK-NEXT: v_bfe_u32 v5, v4, 16, 1 -; CHECK-NEXT: v_add3_u32 v5, v5, v4, s6 +; CHECK-NEXT: v_add3_u32 v5, v5, v4, s4 ; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1] ; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v5, v[2:3] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5 -; CHECK-NEXT: v_and_b32_e32 v6, 1, v5 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[2:3]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v0, v5, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[2:3]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v5, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v6, v5, v6 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v2, v[28:29] -; CHECK-NEXT: v_perm_b32 v10, v0, v4, s7 +; CHECK-NEXT: v_perm_b32 v10, v0, 
v4, s5 ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; CHECK-NEXT: v_and_b32_e32 v3, 1, v2 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[28:29]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[28:29], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v3 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v0, v2, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[28:29]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v2, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v3, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v3, v2, v3 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[28:29], v[28:29] ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cvt_f32_f64_e32 v3, v[30:31] ; CHECK-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v3 -; CHECK-NEXT: v_and_b32_e32 v4, 1, v3 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[30:31]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[30:31], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v0, v3, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[30:31]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v3, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v4, v3, v4 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v3, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[30:31], v[30:31] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CHECK-NEXT: v_perm_b32 v3, v0, v2, s7 +; CHECK-NEXT: v_perm_b32 v3, v0, v2, s5 ; CHECK-NEXT: v_cvt_f32_f64_e32 v2, v[24:25] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 -; CHECK-NEXT: v_and_b32_e32 v4, 1, v2 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[24:25]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[24:25], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v4 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v0, v2, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[24:25]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v2, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v4, v2, v4 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[24:25], v[24:25] ; CHECK-NEXT: v_cvt_f32_f64_e32 v4, v[26:27] ; CHECK-NEXT: v_cndmask_b32_e32 v2, v1, v0, vcc ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; CHECK-NEXT: v_and_b32_e32 v5, 1, v4 -; CHECK-NEXT: 
v_cmp_gt_f64_e64 s[8:9], |v[26:27]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[26:27], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v0, v4, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[26:27]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v4, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v5, v4, v5 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[26:27], v[26:27] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc ; CHECK-NEXT: v_cvt_f32_f64_e32 v4, v[20:21] -; CHECK-NEXT: v_perm_b32 v2, v0, v2, s7 +; CHECK-NEXT: v_perm_b32 v2, v0, v2, s5 ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v4 -; CHECK-NEXT: v_and_b32_e32 v5, 1, v4 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[20:21]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[20:21], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v5 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v0, v4, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[20:21]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v4, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v5, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v5, v4, v5 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v5, v4, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[20:21], v[20:21] ; CHECK-NEXT: v_cvt_f32_f64_e32 v5, v[22:23] ; CHECK-NEXT: v_cndmask_b32_e32 v4, v1, v0, vcc ; CHECK-NEXT: v_cvt_f64_f32_e32 v[0:1], v5 -; CHECK-NEXT: v_and_b32_e32 v6, 1, v5 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[22:23]|, |v[0:1]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[22:23], v[0:1] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v0, v5, v0 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[22:23]|, |v[0:1]| +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v0, v5, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_add_u32_e32 v6, v5, v6 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v5, vcc ; CHECK-NEXT: v_bfe_u32 v1, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v1, v1, v0, s6 +; CHECK-NEXT: v_add3_u32 v1, v1, v0, s4 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[22:23], v[22:23] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc -; CHECK-NEXT: v_perm_b32 v1, v0, v4, s7 +; CHECK-NEXT: v_perm_b32 v1, v0, v4, s5 ; CHECK-NEXT: v_cvt_f32_f64_e32 v0, v[16:17] ; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; CHECK-NEXT: v_and_b32_e32 v6, 1, v0 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[16:17]|, |v[4:5]| ; CHECK-NEXT: 
v_cmp_nlg_f64_e32 vcc, v[16:17], v[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v6 -; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v4, v0, v4 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[16:17]|, |v[4:5]| +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v4, v0, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v6, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v4, 1, v4 +; CHECK-NEXT: v_add_u32_e32 v6, v0, v6 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v0, v6, v0, vcc ; CHECK-NEXT: v_bfe_u32 v4, v0, 16, 1 -; CHECK-NEXT: v_add3_u32 v4, v4, v0, s6 +; CHECK-NEXT: v_add3_u32 v4, v4, v0, s4 ; CHECK-NEXT: v_or_b32_e32 v0, 0x400000, v0 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[16:17], v[16:17] ; CHECK-NEXT: v_cvt_f32_f64_e32 v6, v[18:19] ; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc ; CHECK-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; CHECK-NEXT: v_and_b32_e32 v7, 1, v6 -; CHECK-NEXT: v_cmp_gt_f64_e64 s[8:9], |v[18:19]|, |v[4:5]| ; CHECK-NEXT: v_cmp_nlg_f64_e32 vcc, v[18:19], v[4:5] -; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v7 -; CHECK-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[8:9] -; CHECK-NEXT: v_add_u32_e32 v4, v6, v4 -; CHECK-NEXT: s_or_b64 vcc, vcc, s[4:5] -; CHECK-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc +; CHECK-NEXT: v_cmp_gt_f64_e64 s[6:7], |v[18:19]|, |v[4:5]| +; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; CHECK-NEXT: v_and_or_b32 v4, v6, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e64 v7, -1, 1, s[6:7] +; CHECK-NEXT: v_and_b32_e32 v4, 1, v4 +; CHECK-NEXT: v_add_u32_e32 v7, v6, v7 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; CHECK-NEXT: v_cndmask_b32_e32 v4, v7, v6, vcc ; CHECK-NEXT: v_bfe_u32 v5, v4, 16, 1 -; CHECK-NEXT: v_add3_u32 v5, v5, v4, s6 +; CHECK-NEXT: v_add3_u32 v5, v5, v4, s4 ; CHECK-NEXT: v_or_b32_e32 v4, 0x400000, v4 ; CHECK-NEXT: v_cmp_u_f64_e32 vcc, v[18:19], v[18:19] ; CHECK-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc -; CHECK-NEXT: v_perm_b32 v0, v4, v0, s7 +; CHECK-NEXT: v_perm_b32 v0, v4, v0, s5 ; CHECK-NEXT: global_store_dwordx4 v[32:33], v[0:3], off offset:16 ; CHECK-NEXT: global_store_dwordx4 v[32:33], v[10:13], off ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 3465c782bd700..031f7362826e0 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -22,30 +22,34 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc -; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[2:3] +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: s_mov_b32 s7, -1 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_and_b32_e32 v0, v2, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; 
SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB0_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 +; SDAG-NEXT: s_mov_b64 s[6:7], 0x432 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB0_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else @@ -54,36 +58,36 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: v_mul_lo_u32 v14, v10, v6 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[6:7], v11, v10, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: v_add3_u32 v5, v5, v14, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v12, v[4:5] -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_mul_lo_u32 v6, v9, v11 ; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v8, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v5, v9, v5, v6 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v4 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; SDAG-NEXT: ; implicit-def: 
$vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -93,36 +97,36 @@ define i128 @fptosi_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v10, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[1:2] -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[1:2] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v9, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 ; SDAG-NEXT: .LBB0_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB0_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -386,30 +390,34 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc -; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[2:3] +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: s_mov_b32 s7, -1 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5] -; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_and_b32_e32 v0, v2, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; 
implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB1_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0x432 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 +; SDAG-NEXT: s_mov_b64 s[6:7], 0x432 ; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB1_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else @@ -418,36 +426,36 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v7 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7 ; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v3, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, v1 ; SDAG-NEXT: v_mul_lo_u32 v14, v10, v6 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v11, v10, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[6:7], v11, v10, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr10 ; SDAG-NEXT: v_add3_u32 v5, v5, v14, v13 ; SDAG-NEXT: v_mov_b32_e32 v2, v6 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v9, v12, v[4:5] -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v12, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v9, v12, v[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v7, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_mul_lo_u32 v6, v9, v11 ; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v11, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v11, v8, v[2:3] ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v5, v9, v5, v6 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v2, v4 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v3, v5, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 ; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5 ; SDAG-NEXT: ; implicit-def: $vgpr9 @@ -457,36 +465,36 @@ define i128 @fptoui_f64_to_i128(double %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 ; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc ; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v10, 0 +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v6, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v10, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v5, v10, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v3 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[1:2] -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v4, v2 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3] -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v6, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v6, v8, v[1:2] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 +; SDAG-NEXT: v_addc_co_u32_e64 v3, s[6:7], 0, 0, vcc +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v8, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v9, v6, v[2:3] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v9, v6, v[3:4] ; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3 ; SDAG-NEXT: .LBB1_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[12:13] ; SDAG-NEXT: .LBB1_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -749,31 +757,35 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc -; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[2:3] +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: s_mov_b32 s7, -1 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_and_b32_e32 v0, v2, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 
1, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB2_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 +; SDAG-NEXT: s_mov_b64 s[6:7], 0x95 ; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB2_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else @@ -782,34 +794,34 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 ; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v11, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v4 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v8, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[6:7], v12, v8, v[6:7] ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v9, v12, v[2:3] +; SDAG-NEXT: v_add_co_u32_e32 v3, vcc, v5, v7 +; SDAG-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_mul_lo_u32 v10, v9, v11 ; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v8, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v11, v8, v[3:4] ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v5, v9, v2, v10 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5] +; SDAG-NEXT: 
v_add_co_u32_e32 v2, vcc, v3, v1 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc ; SDAG-NEXT: v_mov_b32_e32 v1, v6 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 @@ -821,29 +833,29 @@ define i128 @fptosi_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB2_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB2_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1100,31 +1112,35 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc -; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[2:3] +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: s_mov_b32 s7, -1 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_and_b32_e32 v0, v2, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB3_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0 -; SDAG-NEXT: s_mov_b64 s[4:5], 0x95 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v9, vcc, -1, v0 +; SDAG-NEXT: s_mov_b64 s[6:7], 0x95 ; SDAG-NEXT: 
v_and_b32_e32 v0, 0x7fffff, v4 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB3_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else @@ -1133,34 +1149,34 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v10, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v10, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2 ; SDAG-NEXT: v_mul_lo_u32 v14, v10, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v10, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v11, v10, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v10, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v4 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v8, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[6:7], v12, v8, v[6:7] ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v9, v12, v[2:3] +; SDAG-NEXT: v_add_co_u32_e32 v3, vcc, v5, v7 +; SDAG-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_mul_lo_u32 v10, v9, v11 ; SDAG-NEXT: v_mul_lo_u32 v9, v9, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v8, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v11, v8, v[3:4] ; SDAG-NEXT: ; implicit-def: $vgpr8 ; SDAG-NEXT: v_add3_u32 v5, v9, v2, v10 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v3, v1 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc ; SDAG-NEXT: v_mov_b32_e32 v1, v6 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 @@ -1172,29 +1188,29 @@ define i128 @fptoui_f32_to_i128(float %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5] 
-; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0 +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v6, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[12:13], v3, v10, 0 ; SDAG-NEXT: v_mov_b32_e32 v2, 0 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2] +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[12:13], v3, v8, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v5 -; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2] +; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[12:13], v9, v3, v[1:2] ; SDAG-NEXT: v_mov_b32_e32 v1, v4 ; SDAG-NEXT: .LBB3_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB3_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1479,67 +1495,71 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc -; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[2:3] +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: s_mov_b32 s7, -1 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i16_e64 s[4:5], -1, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_and_b32_e32 v0, v2, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB6_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: s_movk_i32 s4, 0x7f -; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: s_movk_i32 s6, 0x7f +; SDAG-NEXT: v_and_b32_sdwa v0, v4, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: s_mov_b64 s[6:7], 0x85 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: 
s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB6_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v10, vcc, -1, v0 ; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff7a, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 -; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 ; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v11, v8, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v8, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v4 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v9, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[6:7], v12, v9, v[6:7] ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v10, v12, v[2:3] +; SDAG-NEXT: v_add_co_u32_e32 v3, vcc, v5, v7 +; SDAG-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_mul_lo_u32 v8, v10, v11 ; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v11, v9, v[3:4] ; SDAG-NEXT: v_add3_u32 v5, v10, v2, v8 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v3, v1 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc ; SDAG-NEXT: v_mov_b32_e32 v1, v6 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 @@ -1550,10 +1570,10 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8 ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 @@ -1561,16 +1581,16 @@ define i128 @fptosi_bf16_to_i128(bfloat %x) { 
; SDAG-NEXT: .LBB6_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB6_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB6_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1826,67 +1846,71 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5 ; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc ; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc -; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc +; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[2:3] +; SDAG-NEXT: s_movk_i32 s6, 0xff7f ; SDAG-NEXT: s_mov_b32 s7, -1 -; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], -1, v[2:3] -; SDAG-NEXT: v_cmp_lt_u64_e64 s[6:7], s[6:7], v[0:1] -; SDAG-NEXT: v_cmp_lt_i16_e32 vcc, -1, v4 -; SDAG-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] +; SDAG-NEXT: v_cmp_lt_i16_e64 s[4:5], -1, v4 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SDAG-NEXT: v_and_b32_e32 v0, v2, v0 +; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB7_7 ; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9 -; SDAG-NEXT: s_movk_i32 s4, 0x7f -; SDAG-NEXT: v_and_b32_sdwa v0, v4, s4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; SDAG-NEXT: s_mov_b64 s[4:5], 0x85 -; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6] +; SDAG-NEXT: s_movk_i32 s6, 0x7f +; SDAG-NEXT: v_and_b32_sdwa v0, v4, s6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; SDAG-NEXT: s_mov_b64 s[6:7], 0x85 +; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[5:6] ; SDAG-NEXT: v_mov_b32_e32 v7, 0 -; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, vcc -; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v9, -1, 0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 1, s[4:5] ; SDAG-NEXT: v_or_b32_e32 v6, 0x80, v0 ; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1 ; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3 -; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc ; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7] ; SDAG-NEXT: s_cbranch_execz .LBB7_4 ; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; SDAG-NEXT: v_add_co_u32_e64 v10, s[4:5], -1, v0 +; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v10, vcc, -1, v0 ; SDAG-NEXT: v_sub_u32_e32 v0, 0xc6, v5 ; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff3a, v5 ; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff7a, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7] ; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4 -; SDAG-NEXT: 
v_cndmask_b32_e64 v1, v3, v1, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4 ; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc ; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7] ; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5] -; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v1, s[4:5] -; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v8, 0 +; SDAG-NEXT: v_cndmask_b32_e32 v12, 0, v0, vcc +; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v1, vcc +; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v12, v8, 0 ; SDAG-NEXT: v_mul_lo_u32 v13, v9, v2 ; SDAG-NEXT: v_mul_lo_u32 v14, v8, v3 ; SDAG-NEXT: v_mov_b32_e32 v6, v1 -; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v8, v[6:7] -; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v8, v2, 0 +; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[6:7], v11, v8, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v8, v2, 0 ; SDAG-NEXT: v_mov_b32_e32 v6, v4 -; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v12, v9, v[6:7] +; SDAG-NEXT: v_mad_u64_u32 v[6:7], s[6:7], v12, v9, v[6:7] ; SDAG-NEXT: v_add3_u32 v3, v3, v14, v13 -; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v10, v12, v[2:3] -; SDAG-NEXT: v_add_co_u32_e64 v3, s[4:5], v5, v7 -; SDAG-NEXT: v_addc_co_u32_e64 v4, s[4:5], 0, 0, s[4:5] +; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v10, v12, v[2:3] +; SDAG-NEXT: v_add_co_u32_e32 v3, vcc, v5, v7 +; SDAG-NEXT: v_addc_co_u32_e64 v4, s[6:7], 0, 0, vcc ; SDAG-NEXT: v_mul_lo_u32 v8, v10, v11 ; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12 -; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v11, v9, v[3:4] +; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v11, v9, v[3:4] ; SDAG-NEXT: v_add3_u32 v5, v10, v2, v8 -; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v3, v1 -; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v4, v5, s[4:5] +; SDAG-NEXT: v_add_co_u32_e32 v2, vcc, v3, v1 +; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, v4, v5, vcc ; SDAG-NEXT: v_mov_b32_e32 v1, v6 ; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6 ; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7 @@ -1897,10 +1921,10 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12 ; SDAG-NEXT: v_sub_u32_e32 v2, 0x86, v5 ; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7] -; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5] -; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2 -; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] +; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v2 +; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc ; SDAG-NEXT: v_mul_hi_i32_i24_e32 v1, v0, v8 ; SDAG-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; SDAG-NEXT: v_mul_i32_i24_e32 v0, v0, v8 @@ -1908,16 +1932,16 @@ define i128 @fptoui_bf16_to_i128(bfloat %x) { ; SDAG-NEXT: .LBB7_6: ; %Flow1 ; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB7_7: ; %Flow2 -; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] +; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[10:11] ; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5 ; SDAG-NEXT: v_bfrev_b32_e32 v0, 1 ; SDAG-NEXT: v_bfrev_b32_e32 v1, -2 -; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc -; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v1, s[4:5] ; SDAG-NEXT: v_mov_b32_e32 v0, v2 ; SDAG-NEXT: v_mov_b32_e32 v1, v2 ; SDAG-NEXT: ; %bb.9: ; %Flow3 -; SDAG-NEXT: 
s_or_b64 exec, exec, s[4:5] +; SDAG-NEXT: s_or_b64 exec, exec, s[6:7] ; SDAG-NEXT: .LBB7_10: ; %fp-to-i-cleanup ; SDAG-NEXT: s_or_b64 exec, exec, s[8:9] ; SDAG-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.path.ll b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.path.ll index 5a5e39489d888..d99142356b545 100644 --- a/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.path.ll +++ b/llvm/test/CodeGen/AMDGPU/fptrunc.v2f16.no.fast.path.ll @@ -41,7 +41,7 @@ define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) { ; GFX950-SDAG-NEXT: v_and_b32_e32 v6, 7, v4 ; GFX950-SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 5, v6 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v4 -; GFX950-SDAG-NEXT: s_mov_b32 s3, 0x8000 +; GFX950-SDAG-NEXT: s_movk_i32 s3, 0x8000 ; GFX950-SDAG-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 3, v6 ; GFX950-SDAG-NEXT: s_nop 1 @@ -59,7 +59,7 @@ define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) { ; GFX950-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v5 ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX950-SDAG-NEXT: v_and_or_b32 v0, v1, s3, v0 +; GFX950-SDAG-NEXT: v_bitop3_b16 v0, v1, v0, s3 bitop3:0xec ; GFX950-SDAG-NEXT: v_and_or_b32 v1, v3, s0, v2 ; GFX950-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1 ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 8, v3 @@ -100,7 +100,7 @@ define <2 x half> @v_test_cvt_v2f64_v2f16(<2 x double> %src) { ; GFX950-SDAG-NEXT: s_nop 1 ; GFX950-SDAG-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc ; GFX950-SDAG-NEXT: v_lshrrev_b32_e32 v2, 16, v3 -; GFX950-SDAG-NEXT: v_and_or_b32 v1, v2, s3, v1 +; GFX950-SDAG-NEXT: v_bitop3_b16 v1, v2, v1, s3 bitop3:0xec ; GFX950-SDAG-NEXT: v_perm_b32 v0, v1, v0, s0 ; GFX950-SDAG-NEXT: s_setpc_b64 s[30:31] ; diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll index fbe253e95d210..29cd6c782abb5 100644 --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -1229,60 +1229,64 @@ define amdgpu_kernel void @unsafe_frem_f32(ptr addrspace(1) %out, ptr addrspace( define amdgpu_kernel void @frem_f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[8:11], 0 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f64 v[4:5], s[0:1], v[2:3], v[2:3], v[0:1] +; SI-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], 
v[6:7] ; SI-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 ; SI-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[0:1], v[2:3], v[0:1] +; SI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[2:3], v[0:1] ; SI-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[12:13], -v[4:5], v[10:11], v[8:9] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v9 -; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-NEXT: s_nop 1 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v9 +; SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v4, v5, v4 +; SI-NEXT: v_and_b32_e32 v4, 1, v4 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; SI-NEXT: s_nop 3 ; SI-NEXT: v_div_fmas_f64 v[4:5], v[12:13], v[6:7], v[10:11] ; SI-NEXT: v_div_fixup_f64 v[4:5], v[4:5], v[2:3], v[0:1] -; SI-NEXT: v_readfirstlane_b32 s2, v5 -; SI-NEXT: s_bfe_u32 s0, s2, 0xb0014 -; SI-NEXT: s_add_i32 s3, s0, 0xfffffc01 -; SI-NEXT: s_mov_b32 s1, 0xfffff -; SI-NEXT: s_mov_b32 s0, s6 -; SI-NEXT: s_lshr_b64 s[0:1], s[0:1], s3 -; SI-NEXT: v_not_b32_e32 v6, s0 +; SI-NEXT: v_readfirstlane_b32 s6, v5 +; SI-NEXT: s_bfe_u32 s4, s6, 0xb0014 +; SI-NEXT: s_add_i32 s7, s4, 0xfffffc01 +; SI-NEXT: s_mov_b32 s5, 0xfffff +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; SI-NEXT: v_not_b32_e32 v6, s4 ; SI-NEXT: v_and_b32_e32 v6, v4, v6 -; SI-NEXT: v_not_b32_e32 v7, s1 +; SI-NEXT: v_not_b32_e32 v7, s5 ; SI-NEXT: v_and_b32_e32 v5, v5, v7 -; SI-NEXT: s_and_b32 s0, s2, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s3, 0 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v6, v6, 0, vcc -; SI-NEXT: v_mov_b32_e32 v7, s0 +; SI-NEXT: v_mov_b32_e32 v7, s4 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; SI-NEXT: s_cmp_gt_i32 s3, 51 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_mov_b32_e32 v7, s2 +; SI-NEXT: v_mov_b32_e32 v7, s6 ; SI-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc ; SI-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc ; SI-NEXT: v_fma_f64 v[0:1], -v[4:5], v[2:3], v[0:1] -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_f64: @@ -4114,51 +4118,56 @@ define amdgpu_kernel void @frem_v4f32(ptr addrspace(1) %out, ptr addrspace(1) %i define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %in1, ; SI-LABEL: frem_v2f64: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s7, 0xf000 -; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_mov_b32 s0, s2 -; SI-NEXT: s_mov_b32 s1, s3 -; SI-NEXT: s_mov_b32 s2, s6 -; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_mov_b32 s10, s6 -; SI-NEXT: s_mov_b32 s11, s7 -; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[0:3], 0 -; SI-NEXT: buffer_load_dwordx4 v[4:7], off, s[8:11], 0 offset:64 +; SI-NEXT: s_mov_b32 s0, s8 +; SI-NEXT: s_mov_b32 s1, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_mov_b32 s6, s2 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: buffer_load_dwordx4 v[0:3], off, s[8:11], 0 +; SI-NEXT: 
buffer_load_dwordx4 v[4:7], off, s[4:7], 0 offset:64 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_div_scale_f64 v[8:9], s[0:1], v[6:7], v[6:7], v[2:3] +; SI-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[6:7], v[6:7], v[2:3] ; SI-NEXT: v_rcp_f64_e32 v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] ; SI-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], 1.0 ; SI-NEXT: v_fma_f64 v[10:11], v[10:11], v[12:13], v[10:11] -; SI-NEXT: v_div_scale_f64 v[12:13], s[0:1], v[2:3], v[6:7], v[2:3] +; SI-NEXT: v_div_scale_f64 v[12:13], s[4:5], v[2:3], v[6:7], v[2:3] ; SI-NEXT: v_mul_f64 v[14:15], v[12:13], v[10:11] ; SI-NEXT: v_fma_f64 v[16:17], -v[8:9], v[14:15], v[12:13] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v7, v9 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v3, v13 -; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-NEXT: s_nop 1 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v3, v13 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v8, v9, v8 +; SI-NEXT: v_and_b32_e32 v8, 1, v8 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; SI-NEXT: s_nop 3 ; SI-NEXT: v_div_fmas_f64 v[8:9], v[16:17], v[10:11], v[14:15] ; SI-NEXT: v_div_fixup_f64 v[8:9], v[8:9], v[6:7], v[2:3] ; SI-NEXT: v_readfirstlane_b32 s8, v9 -; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014 -; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01 -; SI-NEXT: s_mov_b32 s3, 0xfffff -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9 -; SI-NEXT: v_not_b32_e32 v10, s0 +; SI-NEXT: s_bfe_u32 s4, s8, 0xb0014 +; SI-NEXT: s_add_i32 s9, s4, 0xfffffc01 +; SI-NEXT: s_mov_b32 s5, 0xfffff +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_lshr_b64 s[6:7], s[4:5], s9 +; SI-NEXT: v_not_b32_e32 v10, s6 ; SI-NEXT: v_and_b32_e32 v10, v8, v10 -; SI-NEXT: v_not_b32_e32 v11, s1 +; SI-NEXT: v_not_b32_e32 v11, s7 ; SI-NEXT: v_and_b32_e32 v9, v9, v11 -; SI-NEXT: s_and_b32 s0, s8, 0x80000000 +; SI-NEXT: s_and_b32 s6, s8, 0x80000000 ; SI-NEXT: s_cmp_lt_i32 s9, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc -; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: v_mov_b32_e32 v11, s6 ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SI-NEXT: s_cmp_gt_i32 s9, 51 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 @@ -4166,42 +4175,46 @@ define amdgpu_kernel void @frem_v2f64(ptr addrspace(1) %out, ptr addrspace(1) %i ; SI-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc ; SI-NEXT: v_cndmask_b32_e32 v8, v10, v8, vcc ; SI-NEXT: v_fma_f64 v[2:3], -v[8:9], v[6:7], v[2:3] -; SI-NEXT: v_div_scale_f64 v[6:7], s[0:1], v[4:5], v[4:5], v[0:1] +; SI-NEXT: v_div_scale_f64 v[6:7], s[6:7], v[4:5], v[4:5], v[0:1] ; SI-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; SI-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] -; SI-NEXT: v_div_scale_f64 v[10:11], s[0:1], v[0:1], v[4:5], v[0:1] +; SI-NEXT: v_div_scale_f64 v[10:11], s[6:7], v[0:1], v[4:5], v[0:1] ; SI-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] ; SI-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] ; SI-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], v1, v11 -; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc -; SI-NEXT: s_nop 1 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; SI-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v6, v7, v6 +; SI-NEXT: v_and_b32_e32 v6, 1, v6 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; SI-NEXT: s_nop 3 ; SI-NEXT: v_div_fmas_f64 v[6:7], 
v[14:15], v[8:9], v[12:13] ; SI-NEXT: v_div_fixup_f64 v[6:7], v[6:7], v[4:5], v[0:1] -; SI-NEXT: v_readfirstlane_b32 s8, v7 -; SI-NEXT: s_bfe_u32 s0, s8, 0xb0014 -; SI-NEXT: s_add_i32 s9, s0, 0xfffffc01 -; SI-NEXT: s_lshr_b64 s[0:1], s[2:3], s9 -; SI-NEXT: v_not_b32_e32 v8, s0 +; SI-NEXT: v_readfirstlane_b32 s6, v7 +; SI-NEXT: s_bfe_u32 s7, s6, 0xb0014 +; SI-NEXT: s_addk_i32 s7, 0xfc01 +; SI-NEXT: s_lshr_b64 s[4:5], s[4:5], s7 +; SI-NEXT: v_not_b32_e32 v8, s4 ; SI-NEXT: v_and_b32_e32 v8, v6, v8 -; SI-NEXT: v_not_b32_e32 v9, s1 +; SI-NEXT: v_not_b32_e32 v9, s5 ; SI-NEXT: v_and_b32_e32 v7, v7, v9 -; SI-NEXT: s_and_b32 s0, s8, 0x80000000 -; SI-NEXT: s_cmp_lt_i32 s9, 0 +; SI-NEXT: s_and_b32 s4, s6, 0x80000000 +; SI-NEXT: s_cmp_lt_i32 s7, 0 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 ; SI-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; SI-NEXT: v_mov_b32_e32 v9, s0 +; SI-NEXT: v_mov_b32_e32 v9, s4 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc -; SI-NEXT: s_cmp_gt_i32 s9, 51 +; SI-NEXT: s_cmp_gt_i32 s7, 51 ; SI-NEXT: s_cselect_b64 vcc, -1, 0 -; SI-NEXT: v_mov_b32_e32 v9, s8 +; SI-NEXT: v_mov_b32_e32 v9, s6 ; SI-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc ; SI-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc ; SI-NEXT: v_fma_f64 v[0:1], -v[6:7], v[4:5], v[0:1] -; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; ; CI-LABEL: frem_v2f64: diff --git a/llvm/test/CodeGen/AMDGPU/function-args.ll b/llvm/test/CodeGen/AMDGPU/function-args.ll index 81b8b36180746..66dc3be45aadb 100644 --- a/llvm/test/CodeGen/AMDGPU/function-args.ll +++ b/llvm/test/CodeGen/AMDGPU/function-args.ll @@ -102,8 +102,9 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; CIGFX89-LABEL: i1_arg_i1_use: ; CIGFX89: ; %bb.0: ; %bb ; CIGFX89-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CIGFX89-NEXT: v_not_b32_e32 v0, v0 ; CIGFX89-NEXT: v_and_b32_e32 v0, 1, v0 -; CIGFX89-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; CIGFX89-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CIGFX89-NEXT: s_and_saveexec_b64 s[4:5], vcc ; CIGFX89-NEXT: s_cbranch_execz .LBB3_2 ; CIGFX89-NEXT: ; %bb.1: ; %bb1 @@ -119,10 +120,11 @@ define void @i1_arg_i1_use(i1 %arg) #0 { ; GFX11-LABEL: i1_arg_i1_use: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_not_b32_e32 v0, v0 ; GFX11-NEXT: s_mov_b32 s0, exec_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cmpx_ne_u32_e32 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmpx_eq_u32_e32 1, v0 ; GFX11-NEXT: s_cbranch_execz .LBB3_2 ; GFX11-NEXT: ; %bb.1: ; %bb1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll index 56ceba258f471..31326dee76b9f 100644 --- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll +++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll @@ -51,7 +51,7 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: image_sample_lz v3, v[2:3], s[16:23], s[68:71] dmask:0x1 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane -; CHECK-NEXT: s_mov_b32 s6, 48 +; CHECK-NEXT: s_mov_b32 s4, 48 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_writelane_b32 v6, s36, 0 ; CHECK-NEXT: v_writelane_b32 v6, s37, 1 @@ -69,16 +69,17 @@ define void @main(i1 %arg) #0 { ; CHECK-NEXT: v_writelane_b32 v6, s48, 12 ; 
CHECK-NEXT: v_writelane_b32 v6, s49, 13 ; CHECK-NEXT: v_writelane_b32 v6, s50, 14 -; CHECK-NEXT: s_movk_i32 s56, 0x1f0 ; CHECK-NEXT: s_movk_i32 s72, 0x2f0 -; CHECK-NEXT: s_mov_b32 s57, s24 +; CHECK-NEXT: s_mov_b32 s5, s24 ; CHECK-NEXT: s_mov_b32 s73, s24 ; CHECK-NEXT: v_writelane_b32 v6, s51, 15 -; CHECK-NEXT: s_load_dwordx8 s[24:31], s[6:7], 0x0 -; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0 -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: s_movk_i32 s6, 0x1f0 +; CHECK-NEXT: s_load_dwordx8 s[24:31], s[4:5], 0x0 +; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0 ; CHECK-NEXT: s_load_dwordx16 s[52:67], s[72:73], 0x0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0 +; CHECK-NEXT: v_not_b32_e32 v0, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll index 50f0a39802270..61249fb19717f 100644 --- a/llvm/test/CodeGen/AMDGPU/idot8u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -81,32 +81,32 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 -; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v14, v3, 20, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX8-NEXT: v_bfe_u32 v7, v0, 4, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, s0 -; GFX8-NEXT: v_mad_u32_u24 v0, v8, v15, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v7, v14, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v6, v13, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v5, v12, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v4, v11, v0 -; GFX8-NEXT: v_mad_u32_u24 v0, v2, v10, v0 -; GFX8-NEXT: v_mad_u32_u24 v2, v1, v9, v0 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, s0 +; GFX8-NEXT: v_bfe_u32 v9, v0, 8, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v6, v7, v4 +; GFX8-NEXT: v_bfe_u32 v11, v0, 12, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v8, v9, v4 +; GFX8-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v10, v11, v4 +; GFX8-NEXT: v_bfe_u32 v15, v0, 20, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v12, v13, v4 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0 +; GFX8-NEXT: v_bfe_u32 v3, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v14, v15, v4 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v4 +; GFX8-NEXT: v_mad_u32_u24 v2, v1, v2, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -129,36 +129,36 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mov_b32_e32 
v0, 0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 -; GFX9-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v2 -; GFX9-NEXT: v_bfe_u32 v11, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v12, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX9-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v11, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v12, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v13, v1, 16, 4 +; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 +; GFX9-NEXT: v_bfe_u32 v15, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v6 +; GFX9-NEXT: v_mul_u32_u24_e32 v6, v7, v8 +; GFX9-NEXT: v_mul_u32_u24_e32 v7, v9, v10 +; GFX9-NEXT: v_mul_u32_u24_e32 v8, v11, v12 ; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v2, v9, v16 -; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_add3_u32 v1, v1, s0, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX9-NEXT: v_add3_u32 v1, v1, v8, v7 -; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, v3, v10 -; GFX9-NEXT: v_add3_u32 v1, v1, v6, v5 -; GFX9-NEXT: v_add3_u32 v1, v1, v4, v3 +; GFX9-NEXT: v_add3_u32 v3, v5, s0, v6 +; GFX9-NEXT: v_mul_u32_u24_e32 v9, v13, v14 +; GFX9-NEXT: v_mul_u32_u24_e32 v10, v15, v16 +; GFX9-NEXT: v_add3_u32 v3, v3, v7, v8 +; GFX9-NEXT: v_add3_u32 v3, v3, v9, v10 +; GFX9-NEXT: v_add3_u32 v1, v3, v1, v2 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -337,8 +337,8 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -351,37 +351,46 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; 
GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_and_b32_sdwa v7, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 +; GFX8-NEXT: v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX8-NEXT: v_and_b32_sdwa v14, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 15, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -390,44 +399,53 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: v_and_b32_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: 
v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 4, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX9-NEXT: v_and_b32_sdwa v13, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 15, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX9-NEXT: v_mad_legacy_u16 v1, v11, v17, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v16, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v15, v1 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v4, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -436,44 +454,53 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 4, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: 
s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v13, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v17 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v11, v17, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v16, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v4, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -483,7 +510,6 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 @@ -492,37 +518,47 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX10-DL-NEXT: global_load_ushort v4, v1, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 -; GFX10-DL-NEXT: 
v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v7, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 12, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v9, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v7 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v9 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v8, v10, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_short v1, v0, s[6:7] +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v4, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { @@ -655,8 +691,8 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -669,37 +705,46 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_and_b32_sdwa v7, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 
src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 +; GFX8-NEXT: v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX8-NEXT: v_and_b32_sdwa v14, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 15, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -708,44 +753,53 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: v_and_b32_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 4, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt 
vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX9-NEXT: v_and_b32_sdwa v13, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 15, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX9-NEXT: v_mad_legacy_u16 v1, v11, v17, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v16, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v15, v1 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v4, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -754,44 +808,53 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 4, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 
v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v13, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v17 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v11, v17, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v16, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v4, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -801,7 +864,6 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 @@ -810,38 +872,54 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, 
v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 24, 4 +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v7, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 12, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v9, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 20, v2 +; GFX10-DL-NEXT: v_and_b32_sdwa v10, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v7 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v9 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v8, v10, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v4, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm + + + + + + ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -974,8 +1052,8 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -988,37 +1066,44 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 +; GFX8-NEXT: v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 +; GFX8-NEXT: 
v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 15, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1028,44 +1113,51 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 4, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; 
GFX9-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 15, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX9-NEXT: v_mad_legacy_u16 v1, v11, v17, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v16, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v15, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v4, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm @@ -1075,44 +1167,51 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 4, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; 
GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v17 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v11, v17, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v16, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v15, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v4, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm @@ -1123,7 +1222,6 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 @@ -1132,39 +1230,59 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v7, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 12, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; 
GFX10-DL-NEXT: v_mov_b32_e32 v4, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v9, v3 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v2 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v4, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm + + + + + + + + + + + + ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1281,8 +1399,8 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -1295,37 +1413,44 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 +; GFX8-NEXT: v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 
+; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 15, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -1335,44 +1460,51 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: s_add_u32 s12, s12, s11 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 4, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-NEXT: v_and_b32_e32 v17, 15, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v16, 
15, v16 +; GFX9-NEXT: v_mad_legacy_u16 v1, v11, v17, v1 +; GFX9-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_mad_legacy_u16 v1, v10, v16, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v9, v15, v1 +; GFX9-NEXT: v_mad_legacy_u16 v1, v8, v4, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm @@ -1382,44 +1514,51 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] ; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] +; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v1 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 28, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 24, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 4, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 24, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 4, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v12, 28, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 24, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 20, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 4, v2 ; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v17 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v1, v2, v3 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v17, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v16, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v11, v17, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v10, v16, v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v9, v15, 
v1 +; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v8, v4, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v7, v14, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v6, v13, v1 ; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v5, v12, v1 -; GFX9-DL-NEXT: v_mad_legacy_u16 v1, v4, v11, v1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm @@ -1430,7 +1569,6 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 @@ -1439,39 +1577,59 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v2, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v3, v0, s[2:3] -; GFX10-DL-NEXT: global_load_ubyte v4, v1, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 4, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v3 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 4, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v4 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_mad_u16 v0, v0, v5, v4 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 8, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 24, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v7, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 12, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v6, v3 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 15 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v8 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v9, v3 +; GFX10-DL-NEXT: v_and_b32_sdwa v7, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 20, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v2 +; GFX10-DL-NEXT: v_mad_u16 v3, v7, v4, v3 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 24, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 24, v2 +; 
GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v3 -; GFX10-DL-NEXT: v_mad_u16 v0, v6, v7, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v4, v5, v0 -; GFX10-DL-NEXT: v_mad_u16 v0, v2, v3, v0 -; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v0 -; GFX10-DL-NEXT: global_store_byte v1, v0, s[6:7] +; GFX10-DL-NEXT: v_mad_u16 v3, v5, v6, v3 +; GFX10-DL-NEXT: v_mad_u16 v3, v4, v7, v3 +; GFX10-DL-NEXT: v_mad_u16 v1, v1, v2, v3 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm + + + + + + + + + + + + ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -1604,34 +1762,34 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 -; GFX8-NEXT: v_bfe_u32 v2, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v4, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v5, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v6, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 +; GFX8-NEXT: v_and_b32_e32 v4, 15, v3 +; GFX8-NEXT: v_bfe_u32 v6, v3, 4, 4 +; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 +; GFX8-NEXT: v_bfe_u32 v10, v3, 12, 4 +; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX8-NEXT: v_bfe_u32 v14, v3, 20, 4 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v0 -; GFX8-NEXT: v_bfe_u32 v10, v0, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v0, 20, 4 -; GFX8-NEXT: v_bfe_u32 v12, v0, 16, 4 -; GFX8-NEXT: v_bfe_u32 v13, v0, 12, 4 -; GFX8-NEXT: v_bfe_u32 v14, v0, 8, 4 -; GFX8-NEXT: v_bfe_u32 v15, v0, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v0, 15, v0 +; GFX8-NEXT: v_and_b32_e32 v5, 15, v0 +; GFX8-NEXT: v_bfe_u32 v7, v0, 4, 4 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mad_u32_u24 v16, v3, v0, s0 -; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v16 -; GFX8-NEXT: v_mad_u32_u24 v3, v8, v15, v16 -; GFX8-NEXT: v_mad_u32_u24 v3, v7, v14, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, v6, v13, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, v5, v12, v3 -; GFX8-NEXT: v_mad_u32_u24 v3, v4, v11, v3 -; GFX8-NEXT: v_mad_u32_u24 v2, v2, v10, v3 -; GFX8-NEXT: v_mad_u32_u24 v1, v1, v9, v2 -; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v1 +; GFX8-NEXT: v_mad_u32_u24 v16, v4, v5, s0 +; GFX8-NEXT: v_bfe_u32 v9, v0, 8, 4 +; GFX8-NEXT: v_mad_u32_u24 v4, v4, v5, v16 +; GFX8-NEXT: v_mad_u32_u24 v5, v6, v7, v16 +; GFX8-NEXT: v_bfe_u32 v11, v0, 12, 4 +; GFX8-NEXT: v_mad_u32_u24 v5, v8, v9, v5 +; GFX8-NEXT: v_bfe_u32 v13, v0, 16, 4 +; GFX8-NEXT: v_mad_u32_u24 v5, v10, v11, v5 +; GFX8-NEXT: v_bfe_u32 v15, v0, 20, 4 +; GFX8-NEXT: v_mad_u32_u24 v5, v12, v13, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v0 +; GFX8-NEXT: v_bfe_u32 v3, v3, 24, 4 +; GFX8-NEXT: v_bfe_u32 v0, v0, 24, 4 +; GFX8-NEXT: v_mad_u32_u24 v5, v14, v15, v5 +; GFX8-NEXT: v_mad_u32_u24 v0, v3, v0, v5 +; GFX8-NEXT: v_mad_u32_u24 v0, v1, v2, v0 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 @@ -1654,37 +1812,37 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v3, v1, 4, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 20, 4 -; 
GFX9-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX9-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX9-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX9-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-NEXT: v_bfe_u32 v10, v2, 8, 4 +; GFX9-NEXT: v_bfe_u32 v11, v1, 12, 4 +; GFX9-NEXT: v_bfe_u32 v12, v2, 12, 4 +; GFX9-NEXT: v_bfe_u32 v13, v1, 16, 4 ; GFX9-NEXT: v_bfe_u32 v14, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-NEXT: v_mul_u32_u24_e32 v17, v1, v2 +; GFX9-NEXT: v_bfe_u32 v15, v1, 20, 4 +; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mad_u32_u24 v1, v1, v2, s0 -; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v16 -; GFX9-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-NEXT: v_mad_u32_u24 v2, v3, v10, v1 -; GFX9-NEXT: v_mul_u32_u24_e32 v7, v7, v14 -; GFX9-NEXT: v_mul_u32_u24_e32 v6, v6, v13 -; GFX9-NEXT: v_add3_u32 v2, v2, v9, v8 -; GFX9-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX9-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-NEXT: v_add3_u32 v2, v2, v7, v6 -; GFX9-NEXT: v_add3_u32 v2, v2, v5, v4 -; GFX9-NEXT: v_add3_u32 v1, v17, v1, v2 +; GFX9-NEXT: v_mad_u32_u24 v3, v5, v6, s0 +; GFX9-NEXT: v_mul_u32_u24_e32 v9, v9, v10 +; GFX9-NEXT: v_mul_u32_u24_e32 v10, v11, v12 +; GFX9-NEXT: v_mad_u32_u24 v4, v7, v8, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v11, v13, v14 +; GFX9-NEXT: v_mul_u32_u24_e32 v12, v15, v16 +; GFX9-NEXT: v_add3_u32 v4, v4, v9, v10 +; GFX9-NEXT: v_add3_u32 v4, v4, v11, v12 +; GFX9-NEXT: v_mul_u32_u24_e32 v17, v5, v6 +; GFX9-NEXT: v_add3_u32 v1, v4, v1, v2 +; GFX9-NEXT: v_add3_u32 v1, v17, v3, v1 ; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_endpgm ; @@ -1705,37 +1863,37 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v3, v1, 4, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 20, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v4, 28, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v2 +; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v8, v2, 4, 4 +; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v10, v2, 8, 4 +; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 +; GFX9-DL-NEXT: v_bfe_u32 v13, v1, 16, 4 ; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 16, 4 -; 
GFX9-DL-NEXT: v_bfe_u32 v15, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 8, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v2, 15, v2 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v1, v2 +; GFX9-DL-NEXT: v_bfe_u32 v15, v1, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 +; GFX9-DL-NEXT: v_bfe_u32 v1, v1, 24, 4 +; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mad_u32_u24 v1, v1, v2, s0 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v16 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v8, v8, v15 -; GFX9-DL-NEXT: v_mad_u32_u24 v2, v3, v10, v1 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v14 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v13 -; GFX9-DL-NEXT: v_add3_u32 v2, v2, v9, v8 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_mul_u32_u24_e32 v4, v4, v11 -; GFX9-DL-NEXT: v_add3_u32 v2, v2, v7, v6 -; GFX9-DL-NEXT: v_add3_u32 v2, v2, v5, v4 -; GFX9-DL-NEXT: v_add3_u32 v1, v17, v1, v2 +; GFX9-DL-NEXT: v_mad_u32_u24 v3, v5, v6, s0 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v9, v9, v10 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v10, v11, v12 +; GFX9-DL-NEXT: v_mad_u32_u24 v4, v7, v8, v3 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v11, v13, v14 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v12, v15, v16 +; GFX9-DL-NEXT: v_add3_u32 v4, v4, v9, v10 +; GFX9-DL-NEXT: v_add3_u32 v4, v4, v11, v12 +; GFX9-DL-NEXT: v_mul_u32_u24_e32 v17, v5, v6 +; GFX9-DL-NEXT: v_add3_u32 v1, v4, v1, v2 +; GFX9-DL-NEXT: v_add3_u32 v1, v17, v3, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-DL-NEXT: s_endpgm ; @@ -1758,40 +1916,55 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v1 -; GFX10-DL-NEXT: v_bfe_u32 v4, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v1, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v2, 12, 4 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 +; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 8, 4 +; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 +; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 12, 4 +; GFX10-DL-NEXT: v_bfe_u32 v11, v2, 12, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v8 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mad_u32_u24 v13, v8, v9, s0 +; GFX10-DL-NEXT: v_mad_u32_u24 v8, v4, v5, s0 +; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 4 +; GFX10-DL-NEXT: v_bfe_u32 v13, v2, 16, 4 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v10, v10, v11 +; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 ; GFX10-DL-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v11 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v7, v7, v12 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, v0, v10, v13 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v2 +; GFX10-DL-NEXT: v_mad_u32_u24 v6, v6, v9, v8 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v0, 28, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 28, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v9, v12, v13 +; GFX10-DL-NEXT: v_bfe_u32 
v1, v1, 24, 4 ; GFX10-DL-NEXT: v_bfe_u32 v2, v2, 24, 4 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v6, v6, v15 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v5, v5, v14 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v7 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v4, v2 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v2, v3, v10 -; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v8, v9 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v6, v5 -; GFX10-DL-NEXT: v_add3_u32 v0, v0, v1, v2 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v11, v11, v14 +; GFX10-DL-NEXT: v_add3_u32 v6, v6, v7, v10 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v0, v0, v3 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v3, v4, v5 +; GFX10-DL-NEXT: v_mul_u32_u24_e32 v1, v1, v2 +; GFX10-DL-NEXT: v_add3_u32 v2, v6, v9, v11 +; GFX10-DL-NEXT: v_add3_u32 v0, v2, v1, v0 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-DL-NEXT: v_add3_u32 v0, v3, v13, v0 +; GFX10-DL-NEXT: v_add3_u32 v0, v3, v8, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[6:7] ; GFX10-DL-NEXT: s_endpgm + + + + + + + + + + + + + + + ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2161,8 +2334,8 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2175,156 +2348,183 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ushort v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_bfe_u32 v6, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_and_b32_sdwa v7, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 +; GFX8-NEXT: v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_bfe_u32 v13, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX8-NEXT: v_and_b32_sdwa v14, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 15, v18 ; 
GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc16_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ushort v3, v0, s[6:7] -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 -; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v1 +; GFX9-NEXT: v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v17, 
v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_perm_b32 v7, v8, v7, s2 +; GFX9-NEXT: v_perm_b32 v8, v13, v12, s2 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s2 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s2 +; GFX9-NEXT: v_perm_b32 v1, v1, v11, s2 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-NEXT: v_perm_b32 v2, v17, v16, s2 +; GFX9-NEXT: v_perm_b32 v4, v10, v9, s2 +; GFX9-NEXT: v_perm_b32 v9, v15, v14, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v7, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc16_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ushort v3, v0, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ushort v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-DL-NEXT: 
v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s2 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s2 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-DL-NEXT: v_perm_b32 v2, v17, v16, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v10, v9, s2 +; GFX9-DL-NEXT: v_perm_b32 v9, v15, v14, s2 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; 
GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: global_store_short v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_short v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc16_vecMul: @@ -2333,6 +2533,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 @@ -2346,50 +2547,76 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ushort v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 4, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX10-DL-NEXT: v_perm_b32 v5, v7, v5, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v6, v6, v10, 0x5040100 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v12 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v13 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX10-DL-NEXT: v_perm_b32 v8, v10, v8, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v7, v9, v7, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NEXT: v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v12 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 -; GFX10-DL-NEXT: 
v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 +; GFX10-DL-NEXT: v_perm_b32 v6, v6, v11, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v8, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: global_store_short v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm + + + + + + + + + + + + + + + + + + ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2485,8 +2712,8 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2499,260 +2726,311 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 28, v3 -; GFX8-NEXT: v_bfe_u32 v10, v3, 24, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v7, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v12, v3, 16, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 4, v3 +; GFX8-NEXT: v_and_b32_e32 v7, 15, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 20, v3 +; GFX8-NEXT: v_and_b32_sdwa v11, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v12, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 28, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v16, 28, v2 -; GFX8-NEXT: v_bfe_u32 v17, v2, 24, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v19, v2, 16, 4 -; GFX8-NEXT: v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD 
src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_mul_lo_u16_e32 v18, v10, v17 -; GFX8-NEXT: v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_bfe_u32 v5, v3, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v6, 15, v3 -; GFX8-NEXT: v_bfe_u32 v3, v2, 4, 4 -; GFX8-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX8-NEXT: v_mul_lo_u16_e32 v2, v12, v19 -; GFX8-NEXT: v_mul_lo_u16_e32 v8, v8, v15 -; GFX8-NEXT: v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v9, v18, v9 -; GFX8-NEXT: v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v3, v2, v11 -; GFX8-NEXT: v_or_b32_e32 v7, v8, v7 -; GFX8-NEXT: v_lshlrev_b32_e32 v8, 16, v9 -; GFX8-NEXT: v_mul_lo_u16_e32 v6, v6, v13 -; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v7 -; GFX8-NEXT: v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX8-NEXT: v_or_b32_e32 v6, v6, v5 -; GFX8-NEXT: v_or_b32_e32 v5, v5, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v8, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX8-NEXT: v_and_b32_e32 v14, 15, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX8-NEXT: v_and_b32_sdwa v18, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 28, v2 +; GFX8-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX8-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX8-NEXT: v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v3, v12, v5 +; GFX8-NEXT: v_mul_lo_u16_e32 v19, v11, v18 +; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_mul_lo_u16_e32 v9, v9, v16 +; GFX8-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX8-NEXT: v_lshlrev_b32_e32 v10, 16, v2 +; GFX8-NEXT: v_or_b32_e32 v3, v19, v3 +; GFX8-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX8-NEXT: v_mul_lo_u16_e32 v7, v7, v14 +; GFX8-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v2 +; GFX8-NEXT: v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v8 +; GFX8-NEXT: v_or_b32_e32 v7, v7, v6 +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 8, v3 +; GFX8-NEXT: v_or_b32_e32 v6, v6, v2 ; GFX8-NEXT: v_lshrrev_b64 v[2:3], 24, v[2:3] -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 8, v5 +; GFX8-NEXT: v_lshrrev_b32_e32 v3, 8, v6 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_add_u16_e32 v3, v6, v4 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v5 -; GFX8-NEXT: v_add_u16_e32 v3, v3, v7 +; GFX8-NEXT: v_add_u16_e32 v4, v7, v4 +; GFX8-NEXT: v_add_u16_e32 v3, v4, v3 +; GFX8-NEXT: v_add_u16_e32 v3, v3, v8 ; GFX8-NEXT: v_add_u16_e32 v2, v3, v2 -; GFX8-NEXT: v_mad_u16 v2, v12, v19, v2 -; GFX8-NEXT: v_add_u16_e32 v2, v2, v8 -; GFX8-NEXT: v_lshrrev_b32_e32 v9, 8, v9 
-; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 +; GFX8-NEXT: v_add_u16_e32 v2, v2, v10 +; GFX8-NEXT: v_mad_u16 v2, v12, v5, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v2, v9 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc8_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NEXT: global_load_ubyte v4, v3, s[6:7] ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: global_load_ubyte v4, v3, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX9-NEXT: v_bfe_u32 v9, v1, 24, 4 -; GFX9-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v11, 20, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_bfe_u32 v1, v2, 4, 4 -; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX9-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v2, v2, 16, 4 -; GFX9-NEXT: v_mul_lo_u16_e32 v18, v11, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v18, 20, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX9-NEXT: v_and_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 4, v2 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 28, v2 +; GFX9-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_and_b32_e32 v0, 15, v5 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v11 +; GFX9-NEXT: v_and_b32_e32 v11, 15, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v15 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v18 +; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-NEXT: v_mul_lo_u16_e32 v18, v12, v2 ; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v17, v9, v16 ; GFX9-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 
dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_mul_lo_u16_e32 v7, v7, v14 -; GFX9-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v0, v18, v10 -; GFX9-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v8 -; GFX9-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-NEXT: v_or_b32_e32 v10, v12, v0 +; GFX9-NEXT: v_mul_lo_u16_sdwa v5, v5, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_e32 v6, v6, v13 +; GFX9-NEXT: v_or_b32_sdwa v13, v17, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_mul_lo_u16_sdwa v11, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_or_b32_e32 v0, v18, v8 +; GFX9-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX9-NEXT: v_or_b32_sdwa v1, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX9-NEXT: v_or_b32_e32 v6, v6, v11 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_or_b32_e32 v8, v11, v0 ; GFX9-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v4, v5, v4 +; GFX9-NEXT: v_add_u16_e32 v4, v6, v4 ; GFX9-NEXT: v_add_u16_e32 v1, v4, v1 -; GFX9-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-NEXT: v_add_u16_e32 v1, v1, v5 ; GFX9-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 -; GFX9-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 +; GFX9-NEXT: v_mad_legacy_u16 v0, v12, v2, v0 ; GFX9-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-NEXT: global_store_byte v3, v0, s[6:7] +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; GFX9-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 +; GFX9-NEXT: v_add_u16_e32 v0, v0, v10 +; GFX9-NEXT: global_store_byte v3, v0, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc8_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[6:7] ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v3, 0 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-DL-NEXT: global_load_ubyte v4, v3, s[0:1] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; 
GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v11, v1, 16, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_bfe_u32 v1, v2, 4, 4 -; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 12, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 8, 4 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 28, v2 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v2, v2, 16, 4 -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v11, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 12, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v18, 20, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_and_b32_e32 v0, 15, v5 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v11 +; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v15 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v18 +; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v18, v12, v2 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v17, v9, v16 ; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-DL-NEXT: v_mul_lo_u16_e32 v7, v7, v14 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_mul_lo_u16_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v0, v18, v10 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-DL-NEXT: v_or_b32_e32 v6, v7, v6 -; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX9-DL-NEXT: v_or_b32_e32 v5, v5, v12 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v8 -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 -; GFX9-DL-NEXT: v_or_b32_e32 v10, v12, v0 +; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v5, v5, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_mul_lo_u16_e32 v6, v6, v13 +; GFX9-DL-NEXT: v_or_b32_sdwa v13, v17, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD 
+; GFX9-DL-NEXT: v_mul_lo_u16_sdwa v11, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-DL-NEXT: v_or_b32_e32 v0, v18, v8 +; GFX9-DL-NEXT: v_or_b32_e32 v5, v7, v5 +; GFX9-DL-NEXT: v_or_b32_sdwa v1, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v5 +; GFX9-DL-NEXT: v_or_b32_e32 v6, v6, v11 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-DL-NEXT: v_or_b32_e32 v8, v11, v0 ; GFX9-DL-NEXT: v_lshrrev_b64 v[0:1], 24, v[0:1] -; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v10 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8 ; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; GFX9-DL-NEXT: v_add_u16_e32 v4, v5, v4 +; GFX9-DL-NEXT: v_add_u16_e32 v4, v6, v4 ; GFX9-DL-NEXT: v_add_u16_e32 v1, v4, v1 -; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v6 +; GFX9-DL-NEXT: v_add_u16_e32 v1, v1, v5 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v1, v0 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v11, v2, v0 -; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v8 -; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v12, v2, v0 ; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v7 -; GFX9-DL-NEXT: global_store_byte v3, v0, s[6:7] +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 8, v10 +; GFX9-DL-NEXT: v_mad_legacy_u16 v0, v9, v16, v0 +; GFX9-DL-NEXT: v_add_u16_e32 v0, v0, v10 +; GFX9-DL-NEXT: global_store_byte v3, v0, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc8_vecMul: ; GFX10-DL: ; %bb.0: ; %entry -; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: s_mov_b32 s14, -1 ; GFX10-DL-NEXT: s_mov_b32 s15, 0x31c16000 ; GFX10-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX10-DL-NEXT: s_clause 0x1 +; GFX10-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX10-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: s_clause 0x1 -; GFX10-DL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX10-DL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[6:7] +; GFX10-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX10-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX10-DL-NEXT: global_load_ubyte v3, v4, s[0:1] +; GFX10-DL-NEXT: v_mov_b32_e32 v0, 15 ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 12, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 4, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 12, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v10, 28, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v11, 20, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v0, v1, 4, 4 -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 8, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 28, v1 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 -; GFX10-DL-NEXT: v_bfe_u32 v11, v1, 20, 4 -; GFX10-DL-NEXT: v_bfe_u32 v12, v1, 16, 
4 -; GFX10-DL-NEXT: v_bfe_u32 v1, v2, 8, 4 -; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v9 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 4, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v14, 28, v2 -; GFX10-DL-NEXT: v_bfe_u32 v15, v2, 20, 4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 12, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v15, 8, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v5 +; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX10-DL-NEXT: v_and_b32_e32 v14, 15, v2 ; GFX10-DL-NEXT: v_mul_lo_u16 v1, v7, v1 -; GFX10-DL-NEXT: v_lshlrev_b16 v6, 8, v6 -; GFX10-DL-NEXT: v_and_b32_e32 v13, 15, v2 -; GFX10-DL-NEXT: v_mul_lo_u16 v0, v0, v9 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v16, v2, 24, 4 -; GFX10-DL-NEXT: v_or_b32_e32 v6, v1, v6 -; GFX10-DL-NEXT: v_mul_lo_u16 v2, v11, v15 -; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v14 -; GFX10-DL-NEXT: v_lshlrev_b16 v9, 8, v0 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 20, v2 +; GFX10-DL-NEXT: v_mul_lo_u16 v8, v8, v15 +; GFX10-DL-NEXT: v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v17, 28, v2 +; GFX10-DL-NEXT: v_lshlrev_b16 v1, 8, v1 +; GFX10-DL-NEXT: v_and_b32_sdwa v15, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v0, 15, v11 +; GFX10-DL-NEXT: v_and_b32_e32 v2, 15, v7 ; GFX10-DL-NEXT: v_mul_lo_u16 v5, v5, v13 -; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v6 -; GFX10-DL-NEXT: v_mul_lo_u16 v1, v12, v7 -; GFX10-DL-NEXT: v_mul_lo_u16 v11, v10, v16 -; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-DL-NEXT: v_or_b32_e32 v7, v8, v1 +; GFX10-DL-NEXT: v_mul_lo_u16 v8, v10, v17 +; GFX10-DL-NEXT: v_mul_lo_u16 v6, v6, v14 +; GFX10-DL-NEXT: v_mul_lo_u16 v2, v0, v2 +; GFX10-DL-NEXT: v_lshlrev_b16 v5, 8, v5 +; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 16, v7 +; GFX10-DL-NEXT: v_mul_lo_u16 v1, v12, v15 +; GFX10-DL-NEXT: v_mul_lo_u16 v10, v9, v16 ; GFX10-DL-NEXT: v_lshlrev_b16 v8, 8, v8 -; GFX10-DL-NEXT: v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_or_b32_e32 v5, v5, v9 +; GFX10-DL-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-DL-NEXT: v_or_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_or_b32_e32 v5, v6, v5 +; GFX10-DL-NEXT: v_or_b32_sdwa v6, v10, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-DL-NEXT: v_or_b32_e32 v1, v1, v2 -; GFX10-DL-NEXT: v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 8, v13 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 8, v11 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 -; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v9 +; GFX10-DL-NEXT: v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-DL-NEXT: v_add_nc_u16 v5, v3, v2 ; GFX10-DL-NEXT: v_lshrrev_b64 v[2:3], 24, v[0:1] ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v6 +; GFX10-DL-NEXT: v_add_nc_u16 v0, v5, v7 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v2 -; GFX10-DL-NEXT: v_mad_u16 v0, v12, v7, v0 +; 
GFX10-DL-NEXT: v_mad_u16 v0, v12, v15, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 8, v8 -; GFX10-DL-NEXT: v_mad_u16 v0, v10, v16, v0 +; GFX10-DL-NEXT: v_mad_u16 v0, v9, v16, v0 ; GFX10-DL-NEXT: v_add_nc_u16 v0, v0, v1 -; GFX10-DL-NEXT: global_store_byte v4, v0, s[6:7] +; GFX10-DL-NEXT: global_store_byte v4, v0, s[0:1] ; GFX10-DL-NEXT: s_endpgm + + + + + + + + + + + + + + + ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: @@ -2849,8 +3127,8 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, 15 ; GFX8-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s0, v2 @@ -2863,159 +3141,184 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_load_ubyte v4, v[0:1] +; GFX8-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX8-NEXT: s_mov_b32 s14, -1 ; GFX8-NEXT: s_mov_b32 s15, 0xe80000 ; GFX8-NEXT: s_add_u32 s12, s12, s11 ; GFX8-NEXT: s_addc_u32 s13, s13, 0 ; GFX8-NEXT: s_waitcnt vmcnt(2) -; GFX8-NEXT: v_lshrrev_b32_e32 v5, 28, v3 -; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v3 -; GFX8-NEXT: v_bfe_u32 v7, v3, 20, 4 -; GFX8-NEXT: v_bfe_u32 v8, v3, 16, 4 -; GFX8-NEXT: v_bfe_u32 v9, v3, 12, 4 -; GFX8-NEXT: v_bfe_u32 v10, v3, 8, 4 -; GFX8-NEXT: v_bfe_u32 v11, v3, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v6, 28, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v8, 20, v3 +; GFX8-NEXT: v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v10, 12, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v11, 8, v3 +; GFX8-NEXT: v_lshrrev_b32_e32 v12, 4, v3 ; GFX8-NEXT: v_and_b32_e32 v3, 15, v3 ; GFX8-NEXT: s_waitcnt vmcnt(1) -; GFX8-NEXT: v_lshrrev_b32_e32 v12, 28, v2 -; GFX8-NEXT: v_lshrrev_b32_e32 v13, 24, v2 -; GFX8-NEXT: v_bfe_u32 v14, v2, 20, 4 -; GFX8-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX8-NEXT: v_bfe_u32 v16, v2, 12, 4 -; GFX8-NEXT: v_bfe_u32 v17, v2, 8, 4 -; GFX8-NEXT: v_bfe_u32 v18, v2, 4, 4 +; GFX8-NEXT: v_lshrrev_b32_e32 v13, 28, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v14, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v15, 20, v2 +; GFX8-NEXT: v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX8-NEXT: v_lshrrev_b32_e32 v16, 12, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v17, 8, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v18, 4, v2 ; GFX8-NEXT: v_and_b32_e32 v2, 15, v2 +; GFX8-NEXT: v_and_b32_e32 v12, 15, v12 +; GFX8-NEXT: v_and_b32_e32 v18, 15, v18 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_mad_u16 v2, v3, v2, v4 -; GFX8-NEXT: v_mad_u16 v2, v11, v18, v2 -; GFX8-NEXT: v_mad_u16 v2, v10, v17, v2 -; GFX8-NEXT: v_mad_u16 v2, v9, v16, v2 +; GFX8-NEXT: v_and_b32_e32 v11, 15, v11 +; GFX8-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX8-NEXT: v_mad_u16 v2, v12, v18, v2 +; GFX8-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX8-NEXT: v_and_b32_e32 v16, 15, v16 +; GFX8-NEXT: v_mad_u16 v2, v11, v17, v2 +; GFX8-NEXT: v_mad_u16 v2, v10, v16, v2 +; GFX8-NEXT: v_mad_u16 v2, v9, v5, v2 ; GFX8-NEXT: v_mad_u16 v2, v8, v15, v2 ; GFX8-NEXT: v_mad_u16 v2, v7, v14, v2 ; GFX8-NEXT: v_mad_u16 v2, v6, v13, v2 -; GFX8-NEXT: v_mad_u16 v2, v5, v12, v2 ; GFX8-NEXT: 
v_and_b32_e32 v2, 15, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm ; ; GFX9-LABEL: udot8_acc4_vecMul: ; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NEXT: global_load_ubyte v3, v0, s[6:7] -; GFX9-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-NEXT: s_mov_b32 s14, -1 ; GFX9-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-NEXT: s_add_u32 s12, s12, s11 +; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v4, 15 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-NEXT: s_waitcnt vmcnt(2) -; GFX9-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_and_b32_e32 v11, 15, v2 -; GFX9-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-NEXT: v_perm_b32 v4, v5, v4, s0 -; GFX9-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-NEXT: v_perm_b32 v9, v14, v13, s0 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX9-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 20, v1 +; GFX9-NEXT: v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX9-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-NEXT: v_and_b32_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX9-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-NEXT: v_perm_b32 v10, v16, v15, s0 -; GFX9-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-NEXT: v_perm_b32 
v7, v8, v7, s2 +; GFX9-NEXT: v_perm_b32 v8, v13, v12, s2 +; GFX9-NEXT: v_perm_b32 v5, v6, v5, s2 +; GFX9-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX9-NEXT: v_perm_b32 v2, v2, v4, s2 +; GFX9-NEXT: v_perm_b32 v1, v1, v11, s2 +; GFX9-NEXT: v_pk_mul_lo_u16 v5, v5, v8 ; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-NEXT: v_perm_b32 v2, v17, v16, s2 +; GFX9-NEXT: v_perm_b32 v4, v10, v9, s2 +; GFX9-NEXT: v_perm_b32 v9, v15, v14, s2 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_add_u16_e32 v3, v5, v3 +; GFX9-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX9-NEXT: v_pk_mul_lo_u16 v4, v7, v9 ; GFX9-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-NEXT: s_endpgm ; ; GFX9-DL-LABEL: udot8_acc4_vecMul: ; GFX9-DL: ; %bb.0: ; %entry -; GFX9-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX9-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 -; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX9-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX9-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: global_load_dword v1, v0, s[0:1] -; GFX9-DL-NEXT: global_load_dword v2, v0, s[2:3] -; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[6:7] -; GFX9-DL-NEXT: s_mov_b32 s0, 0x5040100 ; GFX9-DL-NEXT: s_mov_b32 s14, -1 ; GFX9-DL-NEXT: s_mov_b32 s15, 0xe00000 ; GFX9-DL-NEXT: s_add_u32 s12, s12, s11 +; GFX9-DL-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 +; GFX9-DL-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX9-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-DL-NEXT: v_mov_b32_e32 v4, 15 +; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-DL-NEXT: global_load_dword v1, v0, s[8:9] +; GFX9-DL-NEXT: global_load_dword v2, v0, s[10:11] +; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 +; GFX9-DL-NEXT: global_load_ubyte v3, v0, s[0:1] +; GFX9-DL-NEXT: s_mov_b32 s2, 0x5040100 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt vmcnt(2) -; GFX9-DL-NEXT: v_and_b32_e32 v4, 15, v1 -; GFX9-DL-NEXT: v_bfe_u32 v5, v1, 4, 4 -; GFX9-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v7, v1, 12, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v7, 8, v1 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v8, 12, v1 ; GFX9-DL-NEXT: s_waitcnt vmcnt(1) -; GFX9-DL-NEXT: v_and_b32_e32 v11, 15, v2 -; GFX9-DL-NEXT: v_bfe_u32 v12, v2, 4, 4 -; GFX9-DL-NEXT: v_perm_b32 v6, v7, v6, s0 -; GFX9-DL-NEXT: v_perm_b32 v7, v12, v11, s0 -; GFX9-DL-NEXT: v_perm_b32 v4, v5, v4, s0 -; GFX9-DL-NEXT: v_bfe_u32 v8, v1, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v9, v1, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v13, v2, 8, 4 -; GFX9-DL-NEXT: v_bfe_u32 v14, v2, 12, 4 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v7 -; GFX9-DL-NEXT: v_perm_b32 v8, v9, v8, s0 -; GFX9-DL-NEXT: v_perm_b32 v9, v14, v13, s0 -; GFX9-DL-NEXT: s_waitcnt vmcnt(0) -; 
GFX9-DL-NEXT: v_add_u16_e32 v3, v4, v3 -; GFX9-DL-NEXT: v_bfe_u32 v10, v1, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v13, 4, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX9-DL-NEXT: v_and_b32_e32 v12, 15, v2 +; GFX9-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX9-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX9-DL-NEXT: v_and_b32_e32 v8, 15, v8 +; GFX9-DL-NEXT: v_and_b32_e32 v13, 15, v13 +; GFX9-DL-NEXT: v_and_b32_sdwa v9, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v10, 20, v1 +; GFX9-DL-NEXT: v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX9-DL-NEXT: v_bfe_u32 v15, v2, 16, 4 -; GFX9-DL-NEXT: v_bfe_u32 v16, v2, 20, 4 -; GFX9-DL-NEXT: v_bfe_u32 v17, v2, 24, 4 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v14, 8, v2 +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v15, 12, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-DL-NEXT: v_lshrrev_b32_e32 v17, 20, v2 +; GFX9-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX9-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v9 -; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-DL-NEXT: v_perm_b32 v2, v2, v17, s0 -; GFX9-DL-NEXT: v_perm_b32 v1, v1, v10, s0 -; GFX9-DL-NEXT: v_perm_b32 v10, v16, v15, s0 -; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v5 +; GFX9-DL-NEXT: v_perm_b32 v7, v8, v7, s2 +; GFX9-DL-NEXT: v_perm_b32 v8, v13, v12, s2 +; GFX9-DL-NEXT: v_perm_b32 v5, v6, v5, s2 +; GFX9-DL-NEXT: v_and_b32_e32 v10, 15, v10 +; GFX9-DL-NEXT: v_and_b32_e32 v14, 15, v14 +; GFX9-DL-NEXT: v_and_b32_e32 v15, 15, v15 +; GFX9-DL-NEXT: v_and_b32_e32 v17, 15, v17 +; GFX9-DL-NEXT: v_perm_b32 v2, v2, v4, s2 +; GFX9-DL-NEXT: v_perm_b32 v1, v1, v11, s2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v8 ; GFX9-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v8, v10 +; GFX9-DL-NEXT: v_perm_b32 v2, v17, v16, s2 +; GFX9-DL-NEXT: v_perm_b32 v4, v10, v9, s2 +; GFX9-DL-NEXT: v_perm_b32 v9, v15, v14, s2 +; GFX9-DL-NEXT: s_waitcnt vmcnt(0) +; GFX9-DL-NEXT: v_add_u16_e32 v3, v5, v3 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v2, v4, v2 +; GFX9-DL-NEXT: v_pk_mul_lo_u16 v4, v7, v9 ; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v4 +; GFX9-DL-NEXT: v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v3, v3, v2 ; GFX9-DL-NEXT: v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_add_u16_e32 v2, v2, v1 ; GFX9-DL-NEXT: v_add_u16_sdwa v1, v2, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX9-DL-NEXT: v_and_b32_e32 v1, 15, v1 -; GFX9-DL-NEXT: global_store_byte v0, v1, s[6:7] +; GFX9-DL-NEXT: global_store_byte v0, v1, s[0:1] ; GFX9-DL-NEXT: s_endpgm ; ; GFX10-DL-LABEL: udot8_acc4_vecMul: @@ -3024,6 +3327,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; GFX10-DL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-DL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-DL-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 ; GFX10-DL-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 ; GFX10-DL-NEXT: 
s_mov_b32 s14, -1 @@ -3037,51 +3341,77 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1, ; GFX10-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-DL-NEXT: global_load_ubyte v3, v0, s[6:7] ; GFX10-DL-NEXT: s_waitcnt vmcnt(2) -; GFX10-DL-NEXT: v_and_b32_e32 v4, 15, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v7, 4, v1 ; GFX10-DL-NEXT: s_waitcnt vmcnt(1) -; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v2 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 4, 4 -; GFX10-DL-NEXT: v_bfe_u32 v8, v2, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v1, 12, 4 -; GFX10-DL-NEXT: v_bfe_u32 v10, v1, 20, 4 -; GFX10-DL-NEXT: v_perm_b32 v5, v6, v5, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v4, v7, v4, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v6, v1, 8, 4 -; GFX10-DL-NEXT: v_bfe_u32 v7, v2, 8, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v4, v5 -; GFX10-DL-NEXT: v_perm_b32 v6, v9, v6, 0x5040100 -; GFX10-DL-NEXT: v_perm_b32 v7, v8, v7, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v5, v1, 16, 4 -; GFX10-DL-NEXT: v_bfe_u32 v9, v2, 20, 4 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 4, v2 +; GFX10-DL-NEXT: v_and_b32_e32 v5, 15, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 8, v1 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v7 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 12, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 8, v2 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v13, 12, v2 +; GFX10-DL-NEXT: v_perm_b32 v5, v7, v5, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v6, v6, v10, 0x5040100 +; GFX10-DL-NEXT: v_and_b32_e32 v7, 15, v8 +; GFX10-DL-NEXT: v_and_b32_e32 v8, 15, v12 +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v13 +; GFX10-DL-NEXT: v_and_b32_e32 v9, 15, v9 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v5, v6 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v6, 20, v1 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v12, 20, v2 +; GFX10-DL-NEXT: v_perm_b32 v8, v10, v8, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v7, v9, v7, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v5 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) -; GFX10-DL-NEXT: v_add_nc_u16 v3, v4, v3 -; GFX10-DL-NEXT: v_bfe_u32 v4, v2, 16, 4 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v6, v6, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v5, v3 +; GFX10-DL-NEXT: v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX10-DL-NEXT: v_and_b32_e32 v10, 15, v12 +; GFX10-DL-NEXT: v_and_b32_e32 v6, 15, v6 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v7, v7, v8 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-DL-NEXT: v_perm_b32 v5, v10, v5, 0x5040100 -; GFX10-DL-NEXT: v_bfe_u32 v7, v1, 24, 4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v4, v9, v4, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v8, 16, v6 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v6 -; GFX10-DL-NEXT: v_bfe_u32 v6, v2, 24, 4 +; GFX10-DL-NEXT: v_perm_b32 v6, v6, v11, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v9, 16, v7 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v7 +; GFX10-DL-NEXT: v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v2, 28, v2 -; GFX10-DL-NEXT: v_pk_mul_lo_u16 v4, v5, v4 -; GFX10-DL-NEXT: v_perm_b32 v1, v1, v7, 0x5040100 -; GFX10-DL-NEXT: v_add_nc_u16 
v3, v3, v8 -; GFX10-DL-NEXT: v_perm_b32 v2, v2, v6, 0x5040100 -; GFX10-DL-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v4 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v1, 28, v1 +; GFX10-DL-NEXT: v_pk_mul_lo_u16 v5, v6, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v9 +; GFX10-DL-NEXT: v_perm_b32 v2, v2, v4, 0x5040100 +; GFX10-DL-NEXT: v_perm_b32 v1, v1, v8, 0x5040100 +; GFX10-DL-NEXT: v_lshrrev_b32_e32 v4, 16, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v3, v3, v5 ; GFX10-DL-NEXT: v_pk_mul_lo_u16 v1, v1, v2 -; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v5 +; GFX10-DL-NEXT: v_add_nc_u16 v2, v3, v4 ; GFX10-DL-NEXT: v_lshrrev_b32_e32 v3, 16, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v2, v1 ; GFX10-DL-NEXT: v_add_nc_u16 v1, v1, v3 ; GFX10-DL-NEXT: v_and_b32_e32 v1, 15, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[6:7] ; GFX10-DL-NEXT: s_endpgm + + + + + + + + + + + + + + + + + + ptr addrspace(1) %src2, ptr addrspace(1) nocapture %dst) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll index 8196ffae2ca27..aa89974d72709 100644 --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -69,7 +69,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-TRUE16-NEXT: v_mul_lo_u32 v0, s19, v0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-TRUE16-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_13 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_12 ; GFX11-TRUE16-NEXT: ; %bb.1: ; %bb14 ; GFX11-TRUE16-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c ; GFX11-TRUE16-NEXT: s_mov_b32 s18, 0 @@ -93,98 +93,100 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-TRUE16-NEXT: s_mov_b32 s14, s21 ; GFX11-TRUE16-NEXT: s_mov_b32 s0, -1 ; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_4 -; GFX11-TRUE16-NEXT: s_branch .LBB2_12 +; GFX11-TRUE16-NEXT: s_branch .LBB2_11 ; GFX11-TRUE16-NEXT: .LBB2_3: ; GFX11-TRUE16-NEXT: s_mov_b32 s0, 0 ; GFX11-TRUE16-NEXT: .LBB2_4: ; %bb16 ; GFX11-TRUE16-NEXT: s_load_b32 s1, s[16:17], 0x54 ; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s23, 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s8, -1 ; GFX11-TRUE16-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-TRUE16-NEXT: s_and_b32 s9, s23, 1 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s1, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s8, -1, 0 +; GFX11-TRUE16-NEXT: s_not_b32 s1, s1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s1, 0 ; GFX11-TRUE16-NEXT: s_cselect_b32 s1, -1, 0 ; GFX11-TRUE16-NEXT: s_cmp_eq_u32 s9, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, -1 ; GFX11-TRUE16-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-TRUE16-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-TRUE16-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-TRUE16-NEXT: s_mov_b32 s21, 0 +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v3.l, 0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s24 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mul_hi_u32 s8, s29, s28 -; GFX11-TRUE16-NEXT: s_mul_i32 s9, s29, s28 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s8, s9, 1 -; GFX11-TRUE16-NEXT: s_mov_b32 s9, 0 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s8, v0 +; GFX11-TRUE16-NEXT: s_mul_hi_u32 s9, s29, s28 +; GFX11-TRUE16-NEXT: s_mul_i32 s13, s29, s28 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, s9, s13, 1 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v0 ; GFX11-TRUE16-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s8, 1 +; GFX11-TRUE16-NEXT: s_or_b32 s9, s9, 1 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshr_b32 s8, s8, s30 -; GFX11-TRUE16-NEXT: s_mul_i32 s8, s8, s22 +; GFX11-TRUE16-NEXT: s_lshr_b32 s9, s9, s30 +; GFX11-TRUE16-NEXT: s_mul_i32 s9, s9, s22 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_mul_i32 s8, s8, s20 -; GFX11-TRUE16-NEXT: s_or_b32 s8, s19, s8 +; GFX11-TRUE16-NEXT: s_mul_i32 s9, s9, s20 +; GFX11-TRUE16-NEXT: s_or_b32 s20, s19, s9 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_lshl_b64 s[20:21], s[8:9], 1 -; GFX11-TRUE16-NEXT: s_mov_b32 s8, s9 -; GFX11-TRUE16-NEXT: global_load_u16 v1, v0, s[20:21] -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 +; GFX11-TRUE16-NEXT: s_lshl_b64 s[22:23], s[20:21], 1 +; GFX11-TRUE16-NEXT: global_load_u16 v2, v0, s[22:23] +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s21 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: .p2align 6 ; GFX11-TRUE16-NEXT: .LBB2_6: ; %bb18 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-TRUE16-NEXT: s_and_b32 s8, 0xffff, s8 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s8, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s8, s1, s8 -; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, exec_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-TRUE16-NEXT: s_and_b32 s9, 0xffff, s8 +; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s9, 0 +; GFX11-TRUE16-NEXT: s_cselect_b32 s9, -1, 0 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-TRUE16-NEXT: s_cselect_b32 s8, s19, s13 -; GFX11-TRUE16-NEXT: s_and_b32 s13, 0xffff, s9 -; GFX11-TRUE16-NEXT: s_and_b32 s8, s8, 1 -; GFX11-TRUE16-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-TRUE16-NEXT: s_and_b32 s20, s2, exec_lo -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-TRUE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, s9 +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s9, v2 +; GFX11-TRUE16-NEXT: s_and_b32 s19, s2, exec_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s13, v5 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, s19, s13 -; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s13, 0 -; GFX11-TRUE16-NEXT: s_cselect_b32 s13, 0x100, 0 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_or_b32 s9, s13, s9 +; 
GFX11-TRUE16-NEXT: v_and_b32_e32 v4, v0, v3 +; GFX11-TRUE16-NEXT: s_cselect_b32 s9, s13, s9 +; GFX11-TRUE16-NEXT: s_bitcmp1_b32 s9, 0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11-TRUE16-NEXT: s_cselect_b32 s9, 0x100, 0 +; GFX11-TRUE16-NEXT: s_or_b32 s8, s9, s8 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX11-TRUE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-TRUE16-NEXT: ; %bb.7: ; %Flow -; GFX11-TRUE16-NEXT: s_mov_b32 s8, 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, 0 ; GFX11-TRUE16-NEXT: .LBB2_8: ; %Flow12 ; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s8 -; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_12 -; GFX11-TRUE16-NEXT: ; %bb.9: -; GFX11-TRUE16-NEXT: s_xor_b32 s1, s1, -1 -; GFX11-TRUE16-NEXT: .LBB2_10: ; %bb17 +; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s9 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_11 +; GFX11-TRUE16-NEXT: .LBB2_9: ; %bb17 ; GFX11-TRUE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-TRUE16-NEXT: s_and_b32 vcc_lo, exec_lo, s1 -; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_10 -; GFX11-TRUE16-NEXT: ; %bb.11: ; %Flow6 +; GFX11-TRUE16-NEXT: s_cbranch_vccz .LBB2_9 +; GFX11-TRUE16-NEXT: ; %bb.10: ; %Flow6 ; GFX11-TRUE16-NEXT: s_mov_b32 s18, -1 -; GFX11-TRUE16-NEXT: .LBB2_12: ; %Flow11 +; GFX11-TRUE16-NEXT: .LBB2_11: ; %Flow11 ; GFX11-TRUE16-NEXT: s_and_b32 s20, s0, exec_lo ; GFX11-TRUE16-NEXT: s_or_not1_b32 s0, s18, exec_lo -; GFX11-TRUE16-NEXT: .LBB2_13: ; %Flow9 +; GFX11-TRUE16-NEXT: .LBB2_12: ; %Flow9 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s3, s0 -; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_15 -; GFX11-TRUE16-NEXT: ; %bb.14: ; %bb43 +; GFX11-TRUE16-NEXT: s_cbranch_execz .LBB2_14 +; GFX11-TRUE16-NEXT: ; %bb.13: ; %bb43 ; GFX11-TRUE16-NEXT: s_add_u32 s8, s16, 0x58 ; GFX11-TRUE16-NEXT: s_addc_u32 s9, s17, 0 ; GFX11-TRUE16-NEXT: s_getpc_b64 s[0:1] @@ -196,12 +198,12 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-TRUE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-TRUE16-NEXT: s_or_b32 s20, s20, exec_lo -; GFX11-TRUE16-NEXT: .LBB2_15: ; %Flow14 +; GFX11-TRUE16-NEXT: .LBB2_14: ; %Flow14 ; GFX11-TRUE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11-TRUE16-NEXT: s_and_saveexec_b32 s0, s20 -; GFX11-TRUE16-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock +; GFX11-TRUE16-NEXT: ; %bb.15: ; %UnifiedUnreachableBlock ; GFX11-TRUE16-NEXT: ; divergent unreachable -; GFX11-TRUE16-NEXT: ; %bb.17: ; %UnifiedReturnBlock +; GFX11-TRUE16-NEXT: ; %bb.16: ; %UnifiedReturnBlock ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: f2: @@ -222,7 +224,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-FAKE16-NEXT: v_mul_lo_u32 v0, s19, v0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-FAKE16-NEXT: v_cmpx_eq_u32_e32 0, v0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_13 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_12 ; GFX11-FAKE16-NEXT: ; %bb.1: ; %bb14 ; GFX11-FAKE16-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c ; GFX11-FAKE16-NEXT: s_mov_b32 s18, 0 @@ -246,11 +248,11 
@@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-FAKE16-NEXT: s_mov_b32 s14, s21 ; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 ; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_4 -; GFX11-FAKE16-NEXT: s_branch .LBB2_12 +; GFX11-FAKE16-NEXT: s_branch .LBB2_11 ; GFX11-FAKE16-NEXT: .LBB2_3: ; GFX11-FAKE16-NEXT: s_mov_b32 s2, 0 ; GFX11-FAKE16-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB2_12 +; GFX11-FAKE16-NEXT: s_cbranch_vccnz .LBB2_11 ; GFX11-FAKE16-NEXT: .LBB2_4: ; %bb16 ; GFX11-FAKE16-NEXT: s_load_b32 s0, s[16:17], 0x54 ; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s23, 0 @@ -258,12 +260,18 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-FAKE16-NEXT: s_and_b32 s1, s23, 1 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s13, -1, 0 +; GFX11-FAKE16-NEXT: s_not_b32 s0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s0, 0 ; GFX11-FAKE16-NEXT: s_mov_b32 s0, -1 ; GFX11-FAKE16-NEXT: s_cselect_b32 s8, -1, 0 ; GFX11-FAKE16-NEXT: s_cmp_eq_u32 s1, 0 ; GFX11-FAKE16-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-FAKE16-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-FAKE16-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, s24 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_mul_hi_u32 s0, s29, s28 ; GFX11-FAKE16-NEXT: s_mul_i32 s1, s29, s28 @@ -271,7 +279,7 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, s0, s1, 1 ; GFX11-FAKE16-NEXT: s_mov_b32 s1, 0 ; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v0 -; GFX11-FAKE16-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s13 ; GFX11-FAKE16-NEXT: s_or_b32 s0, s0, 1 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshr_b32 s0, s0, s30 @@ -281,65 +289,59 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-FAKE16-NEXT: s_or_b32 s0, s19, s0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_lshl_b64 s[20:21], s[0:1], 1 -; GFX11-FAKE16-NEXT: s_mov_b32 s0, s1 -; GFX11-FAKE16-NEXT: global_load_u16 v1, v0, s[20:21] -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 +; GFX11-FAKE16-NEXT: global_load_u16 v2, v3, s[20:21] ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: .p2align 6 ; GFX11-FAKE16-NEXT: .LBB2_6: ; %bb18 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-FAKE16-NEXT: s_and_b32 s1, 0xffff, s1 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v0 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s1, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s1, -1, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s8, s1 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, exec_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 0xffff, v3 +; GFX11-FAKE16-NEXT: s_and_b32 
s0, 0xffff, s1 +; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, -1, 0 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-FAKE16-NEXT: s_cselect_b32 s1, s19, s13 -; GFX11-FAKE16-NEXT: s_and_b32 s13, 0xffff, s0 -; GFX11-FAKE16-NEXT: s_and_b32 s1, s1, 1 -; GFX11-FAKE16-NEXT: s_cmp_lg_u32 s13, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, -1, 0 -; GFX11-FAKE16-NEXT: s_and_b32 s20, s9, exec_lo -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, s13 -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v1 +; GFX11-FAKE16-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s0, v2 +; GFX11-FAKE16-NEXT: s_and_b32 s19, s9, exec_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s13, v5 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: v_readfirstlane_b32 s19, v2 -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, s19, s13 -; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s13, 0 -; GFX11-FAKE16-NEXT: s_cselect_b32 s13, 0x100, 0 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_or_b32 s0, s13, s0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, v0, v3 +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, s13, s0 +; GFX11-FAKE16-NEXT: s_bitcmp1_b32 s0, 0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX11-FAKE16-NEXT: s_cselect_b32 s0, 0x100, 0 +; GFX11-FAKE16-NEXT: s_or_b32 s1, s0, s1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v4 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e32 v3, v1, v3, vcc_lo +; GFX11-FAKE16-NEXT: s_mov_b32 vcc_lo, 0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-FAKE16-NEXT: ; %bb.7: ; %Flow ; GFX11-FAKE16-NEXT: s_mov_b32 s0, 0 ; GFX11-FAKE16-NEXT: .LBB2_8: ; %Flow12 ; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_12 -; GFX11-FAKE16-NEXT: ; %bb.9: -; GFX11-FAKE16-NEXT: s_xor_b32 s0, s8, -1 -; GFX11-FAKE16-NEXT: .LBB2_10: ; %bb17 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_11 +; GFX11-FAKE16-NEXT: .LBB2_9: ; %bb17 ; GFX11-FAKE16-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s0 -; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_10 -; GFX11-FAKE16-NEXT: ; %bb.11: ; %Flow6 +; GFX11-FAKE16-NEXT: s_and_b32 vcc_lo, exec_lo, s8 +; GFX11-FAKE16-NEXT: s_cbranch_vccz .LBB2_9 +; GFX11-FAKE16-NEXT: ; %bb.10: ; %Flow6 ; GFX11-FAKE16-NEXT: s_mov_b32 s18, -1 -; GFX11-FAKE16-NEXT: .LBB2_12: ; %Flow11 +; GFX11-FAKE16-NEXT: .LBB2_11: ; %Flow11 ; GFX11-FAKE16-NEXT: s_and_b32 s20, s2, exec_lo ; GFX11-FAKE16-NEXT: s_or_not1_b32 s0, s18, exec_lo -; GFX11-FAKE16-NEXT: .LBB2_13: ; %Flow9 +; GFX11-FAKE16-NEXT: .LBB2_12: ; %Flow9 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s3, s0 -; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_15 -; GFX11-FAKE16-NEXT: ; %bb.14: ; %bb43 +; GFX11-FAKE16-NEXT: s_cbranch_execz .LBB2_14 +; GFX11-FAKE16-NEXT: ; %bb.13: ; %bb43 ; GFX11-FAKE16-NEXT: s_add_u32 s8, s16, 0x58 ; GFX11-FAKE16-NEXT: s_addc_u32 s9, s17, 0 ; GFX11-FAKE16-NEXT: s_getpc_b64 
s[0:1] @@ -351,12 +353,12 @@ define amdgpu_kernel void @f2(i32 %arg, i32 %arg1, i32 %arg2, i1 %arg3, i32 %arg ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-FAKE16-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-FAKE16-NEXT: s_or_b32 s20, s20, exec_lo -; GFX11-FAKE16-NEXT: .LBB2_15: ; %Flow14 +; GFX11-FAKE16-NEXT: .LBB2_14: ; %Flow14 ; GFX11-FAKE16-NEXT: s_or_b32 exec_lo, exec_lo, s3 ; GFX11-FAKE16-NEXT: s_and_saveexec_b32 s0, s20 -; GFX11-FAKE16-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock +; GFX11-FAKE16-NEXT: ; %bb.15: ; %UnifiedUnreachableBlock ; GFX11-FAKE16-NEXT: ; divergent unreachable -; GFX11-FAKE16-NEXT: ; %bb.17: ; %UnifiedReturnBlock +; GFX11-FAKE16-NEXT: ; %bb.16: ; %UnifiedReturnBlock ; GFX11-FAKE16-NEXT: s_endpgm bb: %i = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll index d75e9932bcd82..fb73e1d674cea 100644 --- a/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll +++ b/llvm/test/CodeGen/AMDGPU/kill-true-in-return-block.ll @@ -6,8 +6,9 @@ define amdgpu_ps float @kill_true(i1 %.not) { ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], exec ; CHECK-NEXT: s_wqm_b64 exec, exec +; CHECK-NEXT: v_not_b32_e32 v0, v0 ; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %if1 diff --git a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll index e64ec9956860d..1d8939c6e02da 100644 --- a/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-global-non-entry-func.ll @@ -132,8 +132,9 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX8-SDAG-LABEL: func_uses_lds_multi: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_not_b32_e32 v0, v0 ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX8-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; GFX8-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-SDAG-NEXT: s_mov_b32 m0, -1 ; GFX8-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX8-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] @@ -208,8 +209,9 @@ define void @func_uses_lds_multi(i1 %cond) { ; GFX9-SDAG-LABEL: func_uses_lds_multi: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_not_b32_e32 v0, v0 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX9-SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; GFX9-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX9-SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GFX9-SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-SDAG-NEXT: s_cbranch_execz .LBB2_2 @@ -263,8 +265,9 @@ define void @func_uses_lds_multi(i1 %cond) { ; SDAG-LABEL: func_uses_lds_multi: ; SDAG: ; %bb.0: ; %entry ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SDAG-NEXT: v_not_b32_e32 v0, v0 ; SDAG-NEXT: v_and_b32_e32 v0, 1, v0 -; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc ; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SDAG-NEXT: s_cbranch_execz .LBB2_2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll index 91aba09e942f0..8522807fc4f0e 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll +++ 
b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ballot.i64.wave32.ll @@ -150,11 +150,13 @@ false: define amdgpu_cs i32 @branch_divergent_ballot64_ne_zero_and(i32 %v1, i32 %v2) { ; DAGISEL-LABEL: branch_divergent_ballot64_ne_zero_and: ; DAGISEL: ; %bb.0: -; DAGISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 -; DAGISEL-NEXT: v_cmp_lt_u32_e64 s0, 34, v1 +; DAGISEL-NEXT: v_cmp_lt_u32_e32 vcc_lo, 34, v1 ; DAGISEL-NEXT: s_mov_b32 s1, 0 -; DAGISEL-NEXT: s_and_b32 s0, vcc_lo, s0 -; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; DAGISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; DAGISEL-NEXT: v_cmp_gt_u32_e32 vcc_lo, 12, v0 +; DAGISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; DAGISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; DAGISEL-NEXT: v_and_b32_e32 v0, 1, v0 ; DAGISEL-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; DAGISEL-NEXT: s_cmp_eq_u64 s[0:1], 0 ; DAGISEL-NEXT: s_cbranch_scc1 .LBB8_2 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll index 6b42c4e72d64a..851462889288d 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.class.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI %s declare i1 @llvm.amdgcn.class.f32(float, i32) #1 @@ -15,6 +16,19 @@ declare double @llvm.fabs.f64(double) #1 ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-NEXT: s_load_dword s7, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cmp_class_f32_e32 vcc, s7, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -30,6 +44,19 @@ define amdgpu_kernel void @test_class_f32(ptr addrspace(1) %out, [8 x i32], floa ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fabs_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-NEXT: s_load_dword s7, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], |s7|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fabs = call float @llvm.fabs.f32(float %a) #1 %result = call i1 @llvm.amdgcn.class.f32(float %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -46,6 +73,19 @@ define amdgpu_kernel void @test_class_fabs_f32(ptr addrspace(1) %out, [8 x i32], ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fneg_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-NEXT: s_load_dword s7, s[4:5], 
0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -s7, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fneg = fsub float -0.0, %a %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -62,6 +102,19 @@ define amdgpu_kernel void @test_class_fneg_f32(ptr addrspace(1) %out, [8 x i32], ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fneg_fabs_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0x1c +; SI-NEXT: s_load_dword s7, s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], -|s7|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fabs = call float @llvm.fabs.f32(float %a) #1 %a.fneg.fabs = fsub float -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f32(float %a.fneg.fabs, i32 %b) #1 @@ -77,6 +130,17 @@ define amdgpu_kernel void @test_class_fneg_fabs_f32(ptr addrspace(1) %out, [8 x ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_1_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -90,6 +154,17 @@ define amdgpu_kernel void @test_class_1_f32(ptr addrspace(1) %out, float %a) #0 ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_64_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s6, s[4:5], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], s6, 64 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -105,6 +180,18 @@ define amdgpu_kernel void @test_class_64_f32(ptr addrspace(1) %out, float %a) #0 ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_full_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x3ff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, 
vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 1023) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -119,6 +206,18 @@ define amdgpu_kernel void @test_class_full_mask_f32(ptr addrspace(1) %out, float ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_9bit_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dword s4, s[4:5], 0xb +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1ff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -133,6 +232,23 @@ define amdgpu_kernel void @test_class_9bit_mask_f32(ptr addrspace(1) %out, float ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_test_class_full_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_movk_i32 s4, 0x1ff +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v2, s4 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -151,6 +267,22 @@ define amdgpu_kernel void @v_test_class_full_mask_f32(ptr addrspace(1) %out, ptr ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, 1.0, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -171,6 +303,23 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f32(ptr a ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_class_lit_constant_dynamic_mask_f32: +; SI: ; %bb.0: +; SI-NEXT: 
s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0x44800000 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e32 vcc, s4, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -191,6 +340,19 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f32(ptr addrspac ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cmp_class_f64_e32 vcc, s[6:7], v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -206,6 +368,19 @@ define amdgpu_kernel void @test_class_f64(ptr addrspace(1) %out, [8 x i32], doub ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cmp_class_f64_e64 s[4:5], |s[6:7]|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fabs = call double @llvm.fabs.f64(double %a) #1 %result = call i1 @llvm.amdgcn.class.f64(double %a.fabs, i32 %b) #1 %sext = sext i1 %result to i32 @@ -222,6 +397,19 @@ define amdgpu_kernel void @test_class_fabs_f64(ptr addrspace(1) %out, [8 x i32], ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fneg_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -s[6:7], v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fneg = fsub double -0.0, %a %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg, i32 %b) #1 %sext = sext i1 %result to i32 @@ -238,6 +426,19 @@ define amdgpu_kernel void @test_class_fneg_f64(ptr addrspace(1) %out, [8 x i32], ; SI-NEXT: 
buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x i32], double %a, [8 x i32], i32 %b) #0 { +; SI-LABEL: test_class_fneg_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s8, s[4:5], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_cmp_class_f64_e64 s[4:5], -|s[6:7]|, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %a.fabs = call double @llvm.fabs.f64(double %a) #1 %a.fneg.fabs = fsub double -0.0, %a.fabs %result = call i1 @llvm.amdgcn.class.f64(double %a.fneg.fabs, i32 %b) #1 @@ -250,6 +451,18 @@ define amdgpu_kernel void @test_class_fneg_fabs_f64(ptr addrspace(1) %out, [8 x ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 1{{$}} ; SI: s_endpgm define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0 { +; SI-LABEL: test_class_1_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 1) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -260,6 +473,18 @@ define amdgpu_kernel void @test_class_1_f64(ptr addrspace(1) %out, double %a) #0 ; SI: v_cmp_class_f64_e64 {{s\[[0-9]+:[0-9]+\]}}, {{s\[[0-9]+:[0-9]+\]}}, 64{{$}} ; SI: s_endpgm define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) #0 { +; SI-LABEL: test_class_64_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_cmp_class_f64_e64 s[0:1], s[2:3], 64 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 64) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -276,6 +501,18 @@ define amdgpu_kernel void @test_class_64_f64(ptr addrspace(1) %out, double %a) # ; SI-NEXT: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x i32], double %a) #0 { +; SI-LABEL: test_class_full_mask_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x13 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0x1ff +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 511) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -291,6 +528,25 @@ define amdgpu_kernel void @test_class_full_mask_f64(ptr addrspace(1) %out, [8 x ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void 
@v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: v_test_class_full_mask_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s4, s2 +; SI-NEXT: s_mov_b32 s5, s3 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_movk_i32 s4, 0x1ff +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_mov_b32 s3, s7 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], s4 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[4:5] +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr double, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -307,6 +563,22 @@ define amdgpu_kernel void @v_test_class_full_mask_f64(ptr addrspace(1) %out, ptr ; SI: v_cmp_class_f64_e32 vcc, ; SI: s_endpgm define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_class_inline_imm_constant_dynamic_mask_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f64_e32 vcc, 1.0, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -322,6 +594,24 @@ define amdgpu_kernel void @test_class_inline_imm_constant_dynamic_mask_f64(ptr a ; SI: v_cmp_class_f64_e32 vcc, s{{\[[0-9]+:[0-9]+\]}}, v{{[0-9]+}} ; SI: s_endpgm define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_class_lit_constant_dynamic_mask_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[4:5], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64 +; SI-NEXT: s_mov_b32 s4, 0 +; SI-NEXT: s_mov_b32 s5, 0x40900000 +; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f64_e32 vcc, s[4:5], v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr i32, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -339,6 +629,29 @@ define amdgpu_kernel void @test_class_lit_constant_dynamic_mask_f64(ptr addrspac ; SI-NOT: v_cmp_class ; SI: s_endpgm define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or_class_f32_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: 
s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 3 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -359,6 +672,32 @@ define amdgpu_kernel void @test_fold_or_class_f32_0(ptr addrspace(1) %out, ptr a ; SI-NOT: v_cmp_class ; SI: s_endpgm define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or3_class_f32_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 2 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 1 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 4 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -382,6 +721,56 @@ define amdgpu_kernel void @test_fold_or3_class_f32_0(ptr addrspace(1) %out, ptr ; SI-NOT: v_cmp_class ; SI: s_endpgm define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or_all_tests_class_f32_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s3 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[6:7] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_movk_i32 s6, 0x80 +; SI-NEXT: s_movk_i32 s7, 0x100 +; SI-NEXT: s_movk_i32 s8, 0x200 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 2 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 1 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 4 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 8 +; SI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 16 +; 
SI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 32 +; SI-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5] +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 64 +; SI-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5] +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s6 +; SI-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5] +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s7 +; SI-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; SI-NEXT: v_or_b32_e32 v1, v2, v1 +; SI-NEXT: v_or_b32_e32 v1, v1, v3 +; SI-NEXT: v_or_b32_e32 v1, v1, v4 +; SI-NEXT: v_or_b32_e32 v1, v1, v5 +; SI-NEXT: v_or_b32_e32 v1, v1, v6 +; SI-NEXT: v_or_b32_e32 v1, v1, v7 +; SI-NEXT: v_or_b32_e32 v1, v1, v8 +; SI-NEXT: v_or_b32_e32 v1, v1, v9 +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, s8 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_or_b32_e32 v0, v1, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -417,6 +806,29 @@ define amdgpu_kernel void @test_fold_or_all_tests_class_f32_0(ptr addrspace(1) % ; SI-NOT: v_cmp_class ; SI: s_endpgm define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or_class_f32_1: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 8 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 4 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -437,6 +849,25 @@ define amdgpu_kernel void @test_fold_or_class_f32_1(ptr addrspace(1) %out, ptr a ; SI-NOT: v_cmp_class ; SI: s_endpgm define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; SI-LABEL: test_fold_or_class_f32_2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 7 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -457,6 +888,30 @@ define amdgpu_kernel void @test_fold_or_class_f32_2(ptr addrspace(1) 
%out, ptr a ; SI: s_or_b64 ; SI: s_endpgm define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, ptr addrspace(1) %in, float %b) #0 { +; SI-LABEL: test_no_fold_or_class_f32_0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dword s12, s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v0, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, 4 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_cmp_class_f32_e64 s[0:1], s12, 8 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 1 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %gep.in = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -477,6 +932,15 @@ define amdgpu_kernel void @test_no_fold_or_class_f32_0(ptr addrspace(1) %out, pt ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0 { +; SI-LABEL: test_class_0_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float %a, i32 0) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -489,6 +953,15 @@ define amdgpu_kernel void @test_class_0_f32(ptr addrspace(1) %out, float %a) #0 ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0 { +; SI-LABEL: test_class_0_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f64(double %a, i32 0) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -501,6 +974,15 @@ define amdgpu_kernel void @test_class_0_f64(ptr addrspace(1) %out, double %a) #0 ; SI: buffer_store_dword [[RESULT]] ; SI: s_endpgm define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a, i32 %b) #0 { +; SI-LABEL: test_class_undef_f32: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %result = call i1 @llvm.amdgcn.class.f32(float poison, i32 %b) #1 %sext = sext i1 %result to i32 store i32 %sext, ptr addrspace(1) %out, align 4 @@ -513,6 +995,15 @@ define amdgpu_kernel void @test_class_undef_f32(ptr addrspace(1) %out, float %a, ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]] ; SI-NEXT: s_setpc_b64 define i1 @test_fold_and_ord(float %a) { +; SI-LABEL: test_fold_and_ord: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 35 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1 %ord = fcmp ord float %a, %a %and = and i1 %ord, %class @@ -525,6 +1016,15 @@ define i1 @test_fold_and_ord(float %a) { ; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, [[COND]] ; SI-NEXT: s_setpc_b64 define i1 @test_fold_and_unord(float %a) { +; SI-LABEL: test_fold_and_unord: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 35 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: v_cmp_u_f32_e32 vcc, v0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v0, v0, v1 +; SI-NEXT: s_setpc_b64 s[30:31] %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1 %ord = fcmp uno float %a, %a %and = and i1 %ord, %class @@ -537,6 +1037,19 @@ define i1 @test_fold_and_unord(float %a) { ; SI: v_cmp_o ; SI: s_and_b64 define i1 @test_fold_and_ord_multi_use(float %a) { +; SI-LABEL: test_fold_and_ord_multi_use: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_cmp_class_f32_e64 s[4:5], v0, 35 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v0, v0, v1 +; SI-NEXT: buffer_store_byte v1, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; SI-NEXT: s_setpc_b64 s[30:31] %class = call i1 @llvm.amdgcn.class.f32(float %a, i32 35) #1 store volatile i1 %class, ptr addrspace(1) poison %ord = fcmp ord float %a, %a diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll index 76cff962f7c20..2bf7e681cb88b 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.div.fmas.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; XUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,VI %s @@ -24,6 +25,24 @@ declare double @llvm.amdgcn.div.fmas.f64(double, double, double, i1) nounwind re ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], [[VC]] ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x13 +; GCN-NEXT: s_load_dword s7, s[4:5], 0x1c +; GCN-NEXT: s_load_dword s4, s[4:5], 0x25 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s7 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 
%a, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void @@ -37,6 +56,22 @@ define amdgpu_kernel void @test_div_fmas_f32(ptr addrspace(1) %out, [8 x i32], f ; SI: v_div_fmas_f32 [[RESULT:v[0-9]+]], 1.0, [[VB]], [[VC]] ; SI: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32_inline_imm_0: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x1c +; GCN-NEXT: s_load_dword s4, s[4:5], 0x25 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_div_fmas_f32 v0, 1.0, v0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void @@ -54,6 +89,22 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_0(ptr addrspace(1) %out, ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], 1.0, [[VC]] ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, float %a, float %b, float %c, [8 x i32], i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32_inline_imm_1: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x16 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0xb +; GCN-NEXT: s_load_dword s4, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_div_fmas_f32 v0, v0, 1.0, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void @@ -71,6 +122,22 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_1(ptr addrspace(1) %out, ; GCN: v_div_fmas_f32 [[RESULT:v[0-9]+]], [[VA]], [[VB]], 1.0 ; GCN: buffer_store_dword [[RESULT]], define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr addrspace(1) %out, [8 x i32], float %a, [8 x i32], float %b, [8 x i32], float %c, [8 x i32], i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32_inline_imm_2: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s2, s[4:5], 0x2e +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s6, s[4:5], 0x13 +; GCN-NEXT: s_load_dword s4, s[4:5], 0x1c +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s2, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, 1.0 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void @@ -79,6 +146,26 @@ define amdgpu_kernel void @test_div_fmas_f32_inline_imm_2(ptr 
addrspace(1) %out, ; GCN-LABEL: {{^}}test_div_fmas_f64: ; GCN: v_div_fmas_f64 define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, double %b, double %c, i1 %d) nounwind { +; GCN-LABEL: test_div_fmas_f64: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dword s8, s[4:5], 0x11 +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_bitcmp1_b32 s8, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s8, s0 +; GCN-NEXT: s_mov_b32 s9, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s5 +; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: v_mov_b32_e32 v5, s7 +; GCN-NEXT: v_div_fmas_f64 v[0:1], v[0:1], v[2:3], v[4:5] +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_endpgm %result = call double @llvm.amdgcn.div.fmas.f64(double %a, double %b, double %c, i1 %d) nounwind readnone store double %result, ptr addrspace(1) %out, align 8 ret void @@ -88,6 +175,21 @@ define amdgpu_kernel void @test_div_fmas_f64(ptr addrspace(1) %out, double %a, d ; GCN: s_cmp_eq_u32 s{{[0-9]+}}, 0{{$}} ; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %i) nounwind { +; GCN-LABEL: test_div_fmas_f32_cond_to_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s3, 0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %cmp = icmp eq i32 %i, 0 %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 %cmp) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 @@ -98,6 +200,20 @@ define amdgpu_kernel void @test_div_fmas_f32_cond_to_vcc(ptr addrspace(1) %out, ; GCN: s_mov_b64 vcc, 0 ; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind { +; GCN-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_mov_b64 vcc, 0 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 false) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void @@ -107,6 +223,20 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_false_cond_to_vcc(ptr addrspace ; GCN: s_mov_b64 vcc, -1 ; GCN: v_div_fmas_f32 {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}}, {{v[0-9]+}} define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace(1) %out, float %a, float %b, float %c) nounwind { +; GCN-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: +; GCN: ; 
%bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: s_mov_b64 vcc, -1 +; GCN-NEXT: v_div_fmas_f32 v0, v0, v1, v2 +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float %c, i1 true) nounwind readnone store float %result, ptr addrspace(1) %out, align 4 ret void @@ -124,6 +254,36 @@ define amdgpu_kernel void @test_div_fmas_f32_imm_true_cond_to_vcc(ptr addrspace( ; SI: v_div_fmas_f32 {{v[0-9]+}}, [[A]], [[B]], [[C]] ; SI: s_endpgm define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, i32 %d) nounwind { +; GCN-LABEL: test_div_fmas_f32_logical_cond_to_vcc: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s8, s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v2, 0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[4:5], s[2:3] +; GCN-NEXT: buffer_load_dword v3, v[1:2], s[4:7], 0 addr64 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v4, v[1:2], s[4:7], 0 addr64 offset:4 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 offset:8 glc +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s8, 0 +; GCN-NEXT: s_cselect_b64 s[2:3], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; GCN-NEXT: v_and_b32_e32 v0, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s3, s7 +; GCN-NEXT: s_nop 1 +; GCN-NEXT: v_div_fmas_f32 v0, v3, v4, v1 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:8 +; GCN-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.a = getelementptr float, ptr addrspace(1) %in, i32 %tid %gep.b = getelementptr float, ptr addrspace(1) %gep.a, i32 1 @@ -163,6 +323,38 @@ define amdgpu_kernel void @test_div_fmas_f32_logical_cond_to_vcc(ptr addrspace(1 ; SI: s_endpgm define amdgpu_kernel void @test_div_fmas_f32_i1_phi_vcc(ptr addrspace(1) %out, ptr addrspace(1) %in, ptr addrspace(1) %dummy) nounwind { +; GCN-LABEL: test_div_fmas_f32_i1_phi_vcc: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v4, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[0:1], s[10:11] +; GCN-NEXT: buffer_load_dwordx2 v[1:2], v[3:4], s[0:3], 0 addr64 +; GCN-NEXT: buffer_load_dword v3, v[3:4], s[0:3], 0 addr64 offset:8 +; GCN-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v0 +; GCN-NEXT: s_mov_b64 vcc, 0 +; GCN-NEXT: s_and_saveexec_b64 s[10:11], s[0:1] +; GCN-NEXT: s_cbranch_execz .LBB9_2 +; GCN-NEXT: ; %bb.1: ; %bb +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_mov_b32 s7, s3 +; GCN-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GCN-NEXT: s_and_b64 vcc, vcc, exec +; GCN-NEXT: .LBB9_2: ; %exit +; GCN-NEXT: s_or_b64 exec, exec, s[10:11] 
+; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_nop 0 +; GCN-NEXT: v_div_fmas_f32 v0, v1, v2, v3 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s11, s3 +; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 offset:8 +; GCN-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.out = getelementptr float, ptr addrspace(1) %out, i32 2 @@ -188,3 +380,5 @@ exit: store float %result, ptr addrspace(1) %gep.out, align 4 ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; SI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll index 260b6fb39acb9..601007671d3dc 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w32.ll @@ -1624,9 +1624,15 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 ; GFX11-NEXT: s_cselect_b32 s2, -1, 0 ; GFX11-NEXT: s_cmp_gt_u32 s3, 2 -; GFX11-NEXT: s_cselect_b32 s3, -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s2, s2, s3 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_cselect_b32 s2, -1, 0 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm @@ -1634,13 +1640,17 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX10-LABEL: v_icmp_i1_ne0: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_gt_u32 s2, 1 ; GFX10-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10-NEXT: s_cmp_gt_u32 s3, 2 -; GFX10-NEXT: s_cselect_b32 s3, -1, 0 -; GFX10-NEXT: s_and_b32 s2, s2, s3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s2, -1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 1, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s2 ; GFX10-NEXT: global_store_dword v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll index 13a53f0b96de2..71803fa502a25 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.icmp.w64.ll @@ -1894,9 +1894,15 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX11-NEXT: s_cmp_gt_u32 s2, 1 ; GFX11-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX11-NEXT: s_cmp_gt_u32 s3, 2 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GFX11-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 +; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v0, s2 ; GFX11-NEXT: v_mov_b32_e32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] @@ -1910,7 +1916,11 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; VI-NEXT: s_cselect_b64 s[4:5], -1, 0 ; VI-NEXT: s_cmp_gt_u32 s3, 2 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_and_b32_e32 v0, v0, v1 +; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 @@ -1927,7 +1937,11 @@ define amdgpu_kernel void @v_icmp_i1_ne0(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX9-NEXT: s_cselect_b64 s[4:5], -1, 0 ; GFX9-NEXT: s_cmp_gt_u32 s3, 2 ; GFX9-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX9-NEXT: s_and_b64 s[2:3], s[4:5], s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll index d08f826e80625..a1485c5066975 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i32.ll @@ -132,9 +132,12 @@ define amdgpu_cs void @inverse_ballot_branch(i32 inreg %s0_1, i32 inreg %s2, ptr ; ; SDAG-LABEL: inverse_ballot_branch: ; SDAG: ; %bb.0: ; %entry +; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, s1 +; SDAG-NEXT: v_not_b32_e32 v2, v2 +; SDAG-NEXT: v_and_b32_e32 v2, 1, v2 +; SDAG-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 -; SDAG-NEXT: s_xor_b32 s2, s1, -1 -; SDAG-NEXT: s_and_saveexec_b32 s1, s2 +; SDAG-NEXT: s_and_saveexec_b32 s1, vcc_lo ; SDAG-NEXT: ; %bb.1: ; %if ; SDAG-NEXT: s_add_i32 s0, s0, 1 ; SDAG-NEXT: v_mov_b32_e32 v2, s0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll index fb755ea2e5a7f..400619e2bae03 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.inverse.ballot.i64.ll @@ -308,10 +308,13 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr ; ; SDAG_W64-LABEL: inverse_ballot_branch: ; SDAG_W64: ; %bb.0: ; %entry +; SDAG_W64-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[2:3] +; SDAG_W64-NEXT: v_not_b32_e32 v2, v2 +; SDAG_W64-NEXT: v_and_b32_e32 v2, 1, v2 +; SDAG_W64-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; SDAG_W64-NEXT: v_mov_b32_e32 v3, s1 ; SDAG_W64-NEXT: v_mov_b32_e32 v2, s0 -; SDAG_W64-NEXT: s_xor_b64 s[4:5], s[2:3], -1 -; SDAG_W64-NEXT: s_and_saveexec_b64 s[2:3], s[4:5] +; SDAG_W64-NEXT: s_and_saveexec_b64 s[2:3], vcc ; SDAG_W64-NEXT: ; %bb.1: ; %if ; SDAG_W64-NEXT: s_add_u32 s0, s0, 1 ; SDAG_W64-NEXT: s_addc_u32 s1, s1, 0 @@ -337,9 +340,12 @@ define amdgpu_cs void @inverse_ballot_branch(i64 inreg %s0_1, i64 inreg %s2, ptr ; ; SDAG_W32-LABEL: 
inverse_ballot_branch: ; SDAG_W32: ; %bb.0: ; %entry +; SDAG_W32-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; SDAG_W32-NEXT: v_not_b32_e32 v2, v2 +; SDAG_W32-NEXT: v_and_b32_e32 v2, 1, v2 +; SDAG_W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v2 ; SDAG_W32-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 -; SDAG_W32-NEXT: s_xor_b32 s3, s2, -1 -; SDAG_W32-NEXT: s_and_saveexec_b32 s2, s3 +; SDAG_W32-NEXT: s_and_saveexec_b32 s2, vcc_lo ; SDAG_W32-NEXT: ; %bb.1: ; %if ; SDAG_W32-NEXT: s_add_u32 s0, s0, 1 ; SDAG_W32-NEXT: s_addc_u32 s1, s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll index 94aad397284ff..21a9911ac8ad8 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.kill.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,SI %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -mattr=+wavefrontsize64 -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GCN,GFX10 %s @@ -38,6 +39,9 @@ define amdgpu_ps void @vcc_implicit_def(float %arg13, float %arg14) { ; GCN-NEXT: %bb. ; GCN-NEXT: s_endpgm define amdgpu_gs void @true() { +; GCN-LABEL: true: +; GCN: ; %bb.0: +; GCN-NEXT: s_endpgm call void @llvm.amdgcn.kill(i1 true) ret void } @@ -226,6 +230,19 @@ define amdgpu_gs void @neg_olt(float %a) { ; GCN: v_cndmask_b32 ; GCN: v_cmp_nle_f32 define amdgpu_ps void @fcmp_x2(float %a) #0 { +; SI-LABEL: fcmp_x2: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s0, 0x3e800000 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc +; SI-NEXT: v_cmp_nle_f32_e32 vcc, 0, v0 +; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB21_1 +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB21_1: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm %ogt = fcmp nsz ogt float %a, 2.500000e-01 %k = select i1 %ogt, float -1.000000e+00, float 0.000000e+00 %c = fcmp nsz oge float %k, 0.000000e+00 @@ -242,6 +259,23 @@ define amdgpu_ps void @fcmp_x2(float %a) #0 { ; GCN: s_and{{n2|_not1}}_b64 s[0:1], s[0:1], s[2:3] ; GCN: s_and_b64 exec, exec, s[0:1] define amdgpu_ps float @wqm(float %a) { +; SI-LABEL: wqm: +; SI: ; %bb.0: +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 +; SI-NEXT: s_wqm_b64 s[2:3], vcc +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_andn2_b64 s[2:3], exec, s[2:3] +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: s_cbranch_scc0 .LBB22_2 +; SI-NEXT: ; %bb.1: +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: v_mov_b32_e32 v0, 0 +; SI-NEXT: s_branch .LBB22_3 +; SI-NEXT: .LBB22_2: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB22_3: %c1 = fcmp une float %a, 0.0 %c2 = call i1 @llvm.amdgcn.wqm.vote(i1 %c1) call void @llvm.amdgcn.kill(i1 %c2) @@ -252,6 +286,16 @@ define amdgpu_ps float @wqm(float %a) { ; GCN-LABEL: {{^}}test_sgpr: ; GCN: v_cmp_nle_f32_e64 define amdgpu_ps void @test_sgpr(float inreg %a) #0 { +; SI-LABEL: test_sgpr: +; SI: ; %bb.0: +; SI-NEXT: v_cmp_nle_f32_e64 vcc, s0, 1.0 +; SI-NEXT: s_andn2_b64 exec, exec, vcc +; SI-NEXT: s_cbranch_scc0 .LBB23_1 +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB23_1: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, 
off, off done vm +; SI-NEXT: s_endpgm %c = fcmp ole float %a, 1.000000e+00 call void @llvm.amdgcn.kill(i1 %c) #1 ret void @@ -260,6 +304,18 @@ define amdgpu_ps void @test_sgpr(float inreg %a) #0 { ; GCN-LABEL: {{^}}test_non_inline_imm_sgpr: ; GCN-NOT: v_cmp_le_f32_e64 define amdgpu_ps void @test_non_inline_imm_sgpr(float inreg %a) #0 { +; SI-LABEL: test_non_inline_imm_sgpr: +; SI: ; %bb.0: +; SI-NEXT: v_mov_b32_e32 v0, 0x3fc00000 +; SI-NEXT: v_cmp_le_f32_e32 vcc, s0, v0 +; SI-NEXT: s_andn2_b64 s[0:1], exec, vcc +; SI-NEXT: s_andn2_b64 s[2:3], exec, s[0:1] +; SI-NEXT: s_cbranch_scc0 .LBB24_1 +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB24_1: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm %c = fcmp ole float %a, 1.500000e+00 call void @llvm.amdgcn.kill(i1 %c) #1 ret void @@ -271,6 +327,30 @@ define amdgpu_ps void @test_non_inline_imm_sgpr(float inreg %a) #0 { ; GCN: s_cmp ; GCN: s_cbranch_scc define amdgpu_ps void @test_scc_liveness() #0 { +; SI-LABEL: test_scc_liveness: +; SI: ; %bb.0: ; %main_body +; SI-NEXT: s_mov_b64 s[0:1], exec +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: .LBB25_1: ; %loop3 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_cmp_gt_i32 s2, 0 +; SI-NEXT: s_cselect_b64 s[4:5], -1, 0 +; SI-NEXT: s_andn2_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], s[4:5] +; SI-NEXT: s_cbranch_scc0 .LBB25_4 +; SI-NEXT: ; %bb.2: ; %loop3 +; SI-NEXT: ; in Loop: Header=BB25_1 Depth=1 +; SI-NEXT: s_and_b64 exec, exec, s[0:1] +; SI-NEXT: s_add_i32 s3, s2, 1 +; SI-NEXT: s_cmp_lt_i32 s2, 1 +; SI-NEXT: s_mov_b32 s2, s3 +; SI-NEXT: s_cbranch_scc1 .LBB25_1 +; SI-NEXT: ; %bb.3: ; %endloop15 +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB25_4: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm main_body: br label %loop3 @@ -292,6 +372,41 @@ endloop15: ; preds = %loop3 ; GCN: s_and{{n2|_not1}}_b64 [[LIVE]], [[LIVE]], exec ; GCN-NEXT: s_cbranch_scc0 define amdgpu_ps void @kill_with_loop_exit(float inreg %inp0, float inreg %inp1, <4 x i32> inreg %inp2, float inreg %inp3) { +; SI-LABEL: kill_with_loop_exit: +; SI: ; %bb.0: ; %.entry +; SI-NEXT: v_mov_b32_e32 v0, 0x43000000 +; SI-NEXT: v_cmp_lt_f32_e32 vcc, s1, v0 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_cmp_lt_f32_e32 vcc, s0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_mov_b32_e32 v0, 1.0 +; SI-NEXT: s_cbranch_vccnz .LBB26_5 +; SI-NEXT: ; %bb.1: ; %.preheader1.preheader +; SI-NEXT: v_cmp_ngt_f32_e64 s[0:1], s6, 0 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: s_mov_b64 s[2:3], exec +; SI-NEXT: v_mov_b32_e32 v0, 0x3fc00000 +; SI-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v1 +; SI-NEXT: .LBB26_2: ; %bb +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_and_b64 vcc, exec, s[0:1] +; SI-NEXT: v_add_f32_e32 v0, 0x3e800000, v0 +; SI-NEXT: s_cbranch_vccnz .LBB26_2 +; SI-NEXT: ; %bb.3: ; %bb33 +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_cbranch_scc0 .LBB26_6 +; SI-NEXT: ; %bb.4: ; %bb33 +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: .LBB26_5: ; %bb35 +; SI-NEXT: exp mrt0 v0, v0, v0, v0 done vm +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB26_6: +; SI-NEXT: s_mov_b64 exec, 0 +; SI-NEXT: exp null off, off, off, off done vm +; SI-NEXT: s_endpgm .entry: %tmp24 = fcmp olt float %inp0, 1.280000e+02 %tmp25 = fcmp olt float %inp1, 1.280000e+02 @@ -324,3 +439,5 @@ declare void @llvm.amdgcn.s.sendmsg(i32, 
i32) #0 declare i1 @llvm.amdgcn.wqm.vote(i1) attributes #0 = { nounwind } +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GFX10: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll index 14109391e141a..4beeefb6456b2 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ps.live.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck %s ; CHECK-LABEL: {{^}}test1: @@ -11,6 +12,11 @@ ; the expectation is that the intrinsic will be used in non-trivial shaders, ; so such an optimization doesn't seem worth the effort. define amdgpu_ps float @test1() #0 { +; CHECK-LABEL: test1: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; CHECK-NEXT: ; return to shader part epilog %live = call i1 @llvm.amdgcn.ps.live() %live.32 = zext i1 %live to i32 %r = bitcast i32 %live.32 to float @@ -25,6 +31,16 @@ define amdgpu_ps float @test1() #0 { ; CHECK-DAG: v_cndmask_b32_e64 [[VAR:v[0-9]+]], 0, 1, [[COPY]] ; CHECK: image_sample v0, [[VAR]], define amdgpu_ps float @test2() #0 { +; CHECK-LABEL: test2: +; CHECK: ; %bb.0: +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: s_mov_b64 s[2:3], s[0:1] +; CHECK-NEXT: s_wqm_b64 exec, exec +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: s_and_b64 exec, exec, s[0:1] +; CHECK-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog %live = call i1 @llvm.amdgcn.ps.live() %live.32 = zext i1 %live to i32 %live.32.bc = bitcast i32 %live.32 to float @@ -40,6 +56,24 @@ define amdgpu_ps float @test2() #0 { ; CHECK-DAG: s_and_saveexec_b64 [[SAVED:s\[[0-9]+:[0-9]+\]]], [[HELPER]] ; CHECK: ; %dead define amdgpu_ps float @test3(i32 %in) #0 { +; CHECK-LABEL: test3: +; CHECK: ; %bb.0: ; %entry +; CHECK-NEXT: s_mov_b64 s[0:1], exec +; CHECK-NEXT: s_wqm_b64 exec, exec +; CHECK-NEXT: s_mov_b64 s[2:3], s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; CHECK-NEXT: v_not_b32_e32 v1, v1 +; CHECK-NEXT: v_and_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; CHECK-NEXT: s_and_saveexec_b64 s[2:3], vcc +; CHECK-NEXT: ; %bb.1: ; %dead +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 1, v0 +; CHECK-NEXT: ; %bb.2: ; %end +; CHECK-NEXT: s_or_b64 exec, exec, s[2:3] +; CHECK-NEXT: s_and_b64 exec, exec, s[0:1] +; CHECK-NEXT: image_sample v0, v0, s[0:7], s[0:3] dmask:0x1 +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: ; return to shader part epilog entry: %live = call i1 @llvm.amdgcn.ps.live() br i1 %live, label %end, label %dead diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll index 13ce979a954c2..7bba40ce6ae91 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.wqm.demote.ll @@ -677,19 +677,22 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; SI-NEXT: .LBB6_3: ; %.continue0 ; SI-NEXT: s_or_b64 exec, exec, s[2:3] ; SI-NEXT: s_mov_b64 s[2:3], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] -; SI-NEXT: v_mov_b32_e32 v1, v0 -; SI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 
+; SI-NEXT: v_cndmask_b32_e64 v1, 1.0, 0, s[2:3] +; SI-NEXT: v_mov_b32_e32 v2, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; SI-NEXT: v_xor_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 -; SI-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; SI-NEXT: v_subrev_f32_dpp v1, v1, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; SI-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; SI-NEXT: s_and_b64 exec, exec, s[0:1] -; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: s_and_saveexec_b64 s[2:3], vcc +; SI-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; SI-NEXT: s_cbranch_execz .LBB6_6 ; SI-NEXT: ; %bb.4: ; %.demote1 ; SI-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -725,19 +728,22 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX9-NEXT: .LBB6_3: ; %.continue0 ; GFX9-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX9-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] -; GFX9-NEXT: v_mov_b32_e32 v1, v0 -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 1.0, 0, s[2:3] +; GFX9-NEXT: v_mov_b32_e32 v2, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-NEXT: v_xor_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX9-NEXT: v_subrev_f32_dpp v1, v1, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX9-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX9-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX9-NEXT: s_cbranch_execz .LBB6_6 ; GFX9-NEXT: ; %bb.4: ; %.demote1 ; GFX9-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -773,17 +779,21 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-32-NEXT: .LBB6_3: ; %.continue0 ; GFX10-32-NEXT: s_or_b32 exec_lo, exec_lo, s1 ; GFX10-32-NEXT: s_mov_b32 s1, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s1 -; GFX10-32-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-32-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-32-NEXT: v_cndmask_b32_e64 v1, 1.0, 0, s1 +; 
GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s1 +; GFX10-32-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-32-NEXT: v_subrev_f32_dpp v1, v1, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-32-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX10-32-NEXT: s_and_b32 exec_lo, exec_lo, s0 -; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_xor_b32 s1, s0, -1 -; GFX10-32-NEXT: s_or_b32 s1, s1, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s2, s1 -; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s2 +; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v1 +; GFX10-32-NEXT: v_xor_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s1, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s1, exec_lo, s1 ; GFX10-32-NEXT: s_cbranch_execz .LBB6_6 ; GFX10-32-NEXT: ; %bb.4: ; %.demote1 ; GFX10-32-NEXT: s_andn2_b32 s0, s0, exec_lo @@ -819,17 +829,21 @@ define amdgpu_ps void @wqm_deriv(<2 x float> %input, float %arg, i32 %index) { ; GFX10-64-NEXT: .LBB6_3: ; %.continue0 ; GFX10-64-NEXT: s_or_b64 exec, exec, s[2:3] ; GFX10-64-NEXT: s_mov_b64 s[2:3], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 1.0, 0, s[2:3] -; GFX10-64-NEXT: v_mov_b32_e32 v1, v0 -; GFX10-64-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v1 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec +; GFX10-64-NEXT: v_cndmask_b32_e64 v1, 1.0, 0, s[2:3] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX10-64-NEXT: v_mov_b32_e32 v2, v1 +; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-64-NEXT: v_subrev_f32_dpp v1, v1, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-64-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $exec ; GFX10-64-NEXT: s_and_b64 exec, exec, s[0:1] -; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_xor_b64 s[2:3], s[0:1], -1 -; GFX10-64-NEXT: s_or_b64 s[2:3], s[2:3], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] -; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v1 +; GFX10-64-NEXT: v_xor_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[2:3], vcc +; GFX10-64-NEXT: s_xor_b64 s[2:3], exec, s[2:3] ; GFX10-64-NEXT: s_cbranch_execz .LBB6_6 ; GFX10-64-NEXT: ; %bb.4: ; %.demote1 ; GFX10-64-NEXT: s_andn2_b64 s[0:1], s[0:1], exec @@ -912,20 +926,23 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; SI-NEXT: s_cbranch_execz .LBB7_8 ; SI-NEXT: .LBB7_5: ; %.continue0 ; SI-NEXT: ; =>This Inner Loop Header: Depth=1 -; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b64 s[4:5], s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] -; SI-NEXT: v_mov_b32_e32 v2, v0 -; SI-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; SI-NEXT: s_nop 0 -; SI-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; SI-NEXT: v_mov_b32_e32 v3, v2 +; SI-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_xor_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; SI-NEXT: s_nop 1 -; SI-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; SI-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; SI-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; SI-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; SI-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; SI-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; SI-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; SI-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; SI-NEXT: v_or_b32_e32 v0, v0, v2 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; SI-NEXT: s_cbranch_execz .LBB7_4 ; SI-NEXT: ; %bb.6: ; %.demote1 ; SI-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -978,20 +995,23 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX9-NEXT: s_cbranch_execz .LBB7_8 ; GFX9-NEXT: .LBB7_5: ; %.continue0 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[4:5] -; GFX9-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; GFX9-NEXT: s_nop 0 -; GFX9-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v3, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9-NEXT: v_xor_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 ; GFX9-NEXT: s_nop 1 -; GFX9-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX9-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; GFX9-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX9-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX9-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX9-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB7_4 ; GFX9-NEXT: ; %bb.6: ; %.demote1 ; GFX9-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1045,16 +1065,20 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-32-NEXT: .LBB7_5: ; %.continue0 ; GFX10-32-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-32-NEXT: s_mov_b32 s3, s0 -; GFX10-32-NEXT: v_cndmask_b32_e64 v0, s2, 0, s3 -; GFX10-32-NEXT: s_xor_b32 s3, s0, -1 -; GFX10-32-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-32-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10-32-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10-32-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v0 -; GFX10-32-NEXT: s_or_b32 s3, 
s3, vcc_lo -; GFX10-32-NEXT: s_and_saveexec_b32 s4, s3 -; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s4 +; GFX10-32-NEXT: v_cndmask_b32_e64 v2, s2, 0, s3 +; GFX10-32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s3 +; GFX10-32-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-32-NEXT: v_xor_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-32-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-32-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX10-32-NEXT: v_cmp_neq_f32_e32 vcc_lo, 0, v2 +; GFX10-32-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10-32-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-32-NEXT: s_and_saveexec_b32 s3, vcc_lo +; GFX10-32-NEXT: s_xor_b32 s3, exec_lo, s3 ; GFX10-32-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-32-NEXT: ; %bb.6: ; %.demote1 ; GFX10-32-NEXT: ; in Loop: Header=BB7_5 Depth=1 @@ -1108,16 +1132,20 @@ define amdgpu_ps void @wqm_deriv_loop(<2 x float> %input, float %arg, i32 %index ; GFX10-64-NEXT: .LBB7_5: ; %.continue0 ; GFX10-64-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX10-64-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX10-64-NEXT: v_cndmask_b32_e64 v0, s6, 0, s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], s[0:1], -1 -; GFX10-64-NEXT: v_mov_b32_e32 v2, v0 -; GFX10-64-NEXT: v_mov_b32_dpp v2, v2 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10-64-NEXT: v_subrev_f32_dpp v0, v0, v2 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 -; GFX10-64-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $exec -; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v0 -; GFX10-64-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX10-64-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] -; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[8:9] +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, s6, 0, s[4:5] +; GFX10-64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX10-64-NEXT: v_mov_b32_e32 v3, v2 +; GFX10-64-NEXT: v_xor_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_mov_b32_dpp v3, v3 quad_perm:[1,1,1,1] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-64-NEXT: v_subrev_f32_dpp v2, v2, v3 quad_perm:[0,0,0,0] row_mask:0xf bank_mask:0xf bound_ctrl:1 +; GFX10-64-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $exec +; GFX10-64-NEXT: v_cmp_neq_f32_e32 vcc, 0, v2 +; GFX10-64-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX10-64-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-64-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX10-64-NEXT: s_and_saveexec_b64 s[4:5], vcc +; GFX10-64-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GFX10-64-NEXT: s_cbranch_execz .LBB7_4 ; GFX10-64-NEXT: ; %bb.6: ; %.demote1 ; GFX10-64-NEXT: ; in Loop: Header=BB7_5 Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll index 956145fb24c4a..0c009632ff443 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.bf16.ll @@ -166,9 +166,10 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_and_b32_e32 v0, 
v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: snan_bf16: @@ -178,9 +179,10 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: snan_bf16: @@ -190,9 +192,10 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: snan_bf16: @@ -200,9 +203,10 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: snan_bf16: @@ -210,9 +214,10 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: snan_bf16: @@ -220,9 +225,10 @@ define i1 @snan_bf16(bfloat %x) nounwind { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1) ; 0x001 ret i1 %1 @@ -392,14 +398,15 @@ define i1 
@posnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00 -; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1 -; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f00 +; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: posnormal_bf16: @@ -409,9 +416,10 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f00 -; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: posnormal_bf16: @@ -421,9 +429,10 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f00 -; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: posnormal_bf16: @@ -432,9 +441,10 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: posnormal_bf16: @@ -443,9 +453,10 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.h -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 
s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: posnormal_bf16: @@ -454,9 +465,10 @@ define i1 @posnormal_bf16(bfloat %x) nounwind { ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 ; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 256) ; 0x100 ret i1 %1 @@ -467,15 +479,14 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00 -; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 -; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f00 +; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: negnormal_bf16: @@ -485,9 +496,10 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f00 -; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: negnormal_bf16: @@ -497,9 +509,10 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f00 -; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: negnormal_bf16: @@ -508,9 +521,10 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f00, v1 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; 
GFX10CHECK-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: negnormal_bf16: @@ -519,9 +533,10 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.h -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negnormal_bf16: @@ -530,9 +545,10 @@ define i1 @negnormal_bf16(bfloat %x) nounwind { ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f00, v1 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 8) ; 0x008 ret i1 %1 @@ -601,14 +617,13 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7CHECK-NEXT: v_add_i32_e64 v0, s[4:5], -1, v0 +; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, -1, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: negsubnormal_bf16: @@ -618,9 +633,10 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, -1, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: negsubnormal_bf16: @@ -630,9 +646,10 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, -1, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: negsubnormal_bf16: @@ -641,9 +658,10 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v1, -1 -; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v1 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: negsubnormal_bf16: @@ -652,9 +670,10 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v0.h, -1 -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negsubnormal_bf16: @@ -663,9 +682,10 @@ define i1 @negsubnormal_bf16(bfloat %x) nounwind { ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, v1, -1 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 16) ; 0x010 ret i1 %1 @@ -826,13 +846,12 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: negfinite_bf16: @@ -841,9 +860,10 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX8CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: negfinite_bf16: @@ -852,9 +872,10 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: negfinite_bf16: @@ -862,9 +883,10 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f80, v1 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: negfinite_bf16: @@ -872,9 +894,10 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: negfinite_bf16: @@ -882,9 +905,10 @@ define i1 @negfinite_bf16(bfloat %x) nounwind { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, v1, v0 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %1 = call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 56) ; 0x038 ret i1 %1 @@ -1140,13 +1164,13 @@ define <3 x i1> @isnan_v3bf16(<3 x bfloat> %x) nounwind { ; GFX11SELDAG-TRUE16-LABEL: isnan_v3bf16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v2, 0x7fff7fff, v0 -; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff, v1 -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, 
v2.l +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 +; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v2.l, 0x7fff, v1.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3.l ; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2.h +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3.h ; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v3.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v2.l ; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; @@ -1634,15 +1658,14 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX7CHECK: ; %bb.0: ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 31, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7eff -; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 -; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7eff +; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_is_plus_normal_bf16: @@ -1652,9 +1675,10 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7eff -; GFX8CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_is_plus_normal_bf16: @@ -1664,9 +1688,10 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7eff -; GFX9CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_is_plus_normal_bf16: @@ -1675,9 +1700,10 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: 
not_is_plus_normal_bf16: @@ -1686,9 +1712,10 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.h -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_is_plus_normal_bf16: @@ -1697,9 +1724,10 @@ define i1 @not_is_plus_normal_bf16(bfloat %x) { ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 ; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 767) ; ~0x100 = ~"+normal" ret i1 %class @@ -1711,14 +1739,15 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v1, 16, v0 +; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7eff -; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1 -; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7eff +; GFX7CHECK-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_is_neg_normal_bf16: @@ -1728,9 +1757,10 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7eff -; GFX8CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_is_neg_normal_bf16: @@ -1740,9 +1770,10 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7eff -; GFX9CHECK-NEXT: v_cmp_lt_u16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, 
s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_lt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_is_neg_normal_bf16: @@ -1751,9 +1782,10 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX10CHECK-NEXT: v_cmp_lt_u16_e64 s4, 0x7eff, v1 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: not_is_neg_normal_bf16: @@ -1762,9 +1794,10 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.h -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_is_neg_normal_bf16: @@ -1773,9 +1806,10 @@ define i1 @not_is_neg_normal_bf16(bfloat %x) { ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, -1, v0 ; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, 0xff80, v1 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_u16_e64 s0, 0x7eff, v1 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_u16_e32 vcc_lo, 0x7eff, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 1015) ; ~0x008 = ~"-normal" ret i1 %class @@ -2068,50 +2102,56 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 -; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f80 -; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s6, v0 -; GFX7CHECK-NEXT: s_mov_b32 s7, 0xff80 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s7, v1 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s6, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: s_mov_b32 s5, 0xff80 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX7CHECK-NEXT: 
v_cmp_eq_u32_e32 vcc, s5, v1 +; GFX7CHECK-NEXT: v_and_b32_e32 v2, v3, v2 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_ispositive_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX8CHECK-NEXT: s_movk_i32 s6, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 -; GFX8CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s6, v1 -; GFX8CHECK-NEXT: s_movk_i32 s7, 0xff80 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s7, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s6, v1 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_and_b32_e32 v2, 0x7fff, v0 +; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v2 +; GFX8CHECK-NEXT: s_movk_i32 s5, 0xff80 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0 +; GFX8CHECK-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v2 +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_ispositive_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX9CHECK-NEXT: s_movk_i32 s6, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 -; GFX9CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s6, v1 -; GFX9CHECK-NEXT: s_movk_i32 s7, 0xff80 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s7, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s6, v1 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_and_b32_e32 v2, 0x7fff, v0 +; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v2 +; GFX9CHECK-NEXT: s_movk_i32 s5, 0xff80 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s5, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v2 +; GFX9CHECK-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9CHECK-NEXT: v_or3_b32 v0, v1, v0, v2 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_ispositive_bf16: @@ -2119,13 +2159,15 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0xff80, v0 -; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f80, v1 -; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s6, 0x7f80, v1 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: s_or_b32 s4, s4, s5 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, s6 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; 
GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0 +; GFX10CHECK-NEXT: v_and_b32_e32 v2, v3, v2 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or3_b32 v0, v2, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: not_ispositive_bf16: @@ -2133,13 +2175,15 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v0.h -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s2, 0x7f80, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s2 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_ispositive_bf16: @@ -2147,13 +2191,15 @@ define i1 @not_ispositive_bf16(bfloat %x) { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s2, 0x7f80, v1 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s2 -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v2, v3, v2 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or3_b32 v0, v2, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 63) ; ~fcPositive ret i1 %class @@ -2165,80 +2211,86 @@ define i1 @isnegative_bf16(bfloat %x) { ; GFX7CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX7CHECK-NEXT: v_ashrrev_i32_e32 v2, 16, v0 +; GFX7CHECK-NEXT: v_lshrrev_b32_e32 v2, 31, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX7CHECK-NEXT: 
v_cmp_gt_i32_e32 vcc, 0, v2 -; GFX7CHECK-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0 -; GFX7CHECK-NEXT: s_mov_b32 s6, 0xff80 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s6, v1 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: s_mov_b32 s4, 0xff80 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v1 +; GFX7CHECK-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: isnegative_bf16: ; GFX8CHECK: ; %bb.0: ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 -; GFX8CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v1 -; GFX8CHECK-NEXT: s_movk_i32 s6, 0xff80 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s6, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_and_b32_e32 v2, 0x7fff, v0 +; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v2 +; GFX8CHECK-NEXT: s_movk_i32 s4, 0xff80 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: isnegative_bf16: ; GFX9CHECK: ; %bb.0: ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 -; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX9CHECK-NEXT: s_movk_i32 s4, 0xff80 +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, 0, v0 -; GFX9CHECK-NEXT: v_cmp_gt_i16_e64 s[4:5], s4, v1 -; GFX9CHECK-NEXT: s_movk_i32 s6, 0xff80 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s6, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: isnegative_bf16: ; GFX10CHECK: ; %bb.0: ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0 ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0xff80, v0 -; GFX10CHECK-NEXT: v_cmp_gt_i16_e64 s4, 0x7f80, v1 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: s_or_b32 s4, s4, s5 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX10CHECK-NEXT: s_setpc_b64 
s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: isnegative_bf16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: isnegative_bf16: ; GFX11SELDAG-FAKE16: ; %bb.0: ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0xff80, v0 ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s1, 0xff80, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e64 s0, 0x7f80, v1 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_or_b32 v0, v1, v0, v2 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 60) ; fcNegative ret i1 %class @@ -2254,9 +2306,10 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 -; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v1 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_isnegative_bf16: @@ -2266,9 +2319,10 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_isnegative_bf16: @@ -2278,9 +2332,10 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_isnegative_bf16: @@ -2288,9 +2343,10 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 -; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v1 -; GFX10CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: not_isnegative_bf16: @@ -2298,9 +2354,10 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.h, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_isnegative_bf16: @@ -2308,9 +2365,10 @@ define i1 @not_isnegative_bf16(bfloat %x) { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f81, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v1 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] %class = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 963) ; ~fcNegative ret i1 %class @@ -2324,9 +2382,10 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: iszero_or_nan_bf16: @@ -2335,9 +2394,10 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: iszero_or_nan_bf16: @@ -2346,9 +2406,10 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: iszero_or_nan_bf16: @@ -2356,9 +2417,10 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_bf16: @@ -2366,9 +2428,10 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_bf16: @@ -2376,9 +2439,10 @@ define i1 @iszero_or_nan_bf16(bfloat %x) { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan" @@ -2393,9 +2457,10 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; 
GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: iszero_or_nan_f_daz: @@ -2404,9 +2469,10 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: iszero_or_nan_f_daz: @@ -2415,9 +2481,10 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: iszero_or_nan_f_daz: @@ -2425,9 +2492,10 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_daz: @@ -2435,9 +2503,10 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_daz: @@ -2445,9 +2514,10 @@ define i1 @iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan" @@ -2462,9 +2532,10 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: iszero_or_nan_f_maybe_daz: @@ -2473,9 +2544,10 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: iszero_or_nan_f_maybe_daz: @@ -2484,9 +2556,10 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: iszero_or_nan_f_maybe_daz: @@ -2494,9 +2567,10 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: iszero_or_nan_f_maybe_daz: @@ -2504,9 +2578,10 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_nan_f_maybe_daz: @@ -2514,9 
+2589,10 @@ define i1 @iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 99) ; 0x60|0x3 = "zero|nan" @@ -2531,9 +2607,10 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_iszero_or_nan_bf16: @@ -2542,9 +2619,10 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_iszero_or_nan_bf16: @@ -2553,9 +2631,10 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_iszero_or_nan_bf16: @@ -2563,9 +2642,10 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_bf16: @@ -2573,9 +2653,10 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, 
v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_bf16: @@ -2583,9 +2664,10 @@ define i1 @not_iszero_or_nan_bf16(bfloat %x) { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~0x60 = "~(zero|nan)" @@ -2600,9 +2682,10 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_iszero_or_nan_f_daz: @@ -2611,9 +2694,10 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_iszero_or_nan_f_daz: @@ -2622,9 +2706,10 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_iszero_or_nan_f_daz: @@ -2632,9 +2717,10 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX10CHECK-NEXT: 
v_cmp_ne_u16_e64 s4, 0, v0 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_daz: @@ -2642,9 +2728,10 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_daz: @@ -2652,9 +2739,10 @@ define i1 @not_iszero_or_nan_f_daz(bfloat %x) #0 { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)" @@ -2669,9 +2757,10 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -2680,9 +2769,10 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], 0, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -2691,9 +2781,10 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f81 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: 
v_cmp_ne_u16_e64 s[4:5], 0, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_ne_u16_e32 vcc, 0, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -2701,9 +2792,10 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX10CHECK-NEXT: v_cmp_ne_u16_e64 s4, 0, v0 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -2711,9 +2803,10 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -2721,9 +2814,10 @@ define i1 @not_iszero_or_nan_f_maybe_daz(bfloat %x) #1 { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7f81, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e64 s0, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_ne_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 924) ; ~(0x60|0x3) = "~(zero|nan)" @@ -2738,9 +2832,10 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: iszero_or_qnan_bf16: @@ -2749,9 +2844,10 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: 
v_cmp_eq_u16_e64 s[4:5], 0, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: iszero_or_qnan_bf16: @@ -2760,9 +2856,10 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fbf ; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: v_cmp_eq_u16_e64 s[4:5], 0, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: iszero_or_qnan_bf16: @@ -2770,9 +2867,10 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s4, 0, v0 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: iszero_or_qnan_bf16: @@ -2780,9 +2878,10 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_qnan_bf16: @@ -2790,9 +2889,10 @@ define i1 @iszero_or_qnan_bf16(bfloat %x) { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s0, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 98) ; 0x60|0x2 = "zero|qnan" @@ -2808,11 +2908,13 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], s4, v0 -; GFX7CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; 
GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: iszero_or_snan_bf16: @@ -2822,61 +2924,67 @@ define i1 @iszero_or_snan_bf16(bfloat %x) { ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fc0 ; GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0 -; GFX8CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: iszero_or_snan_bf16: ; GFX9CHECK: ; %bb.0: ; %entry ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fc0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s4, v0 -; GFX9CHECK-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, 0, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: iszero_or_snan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0, v0 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: s_or_b32 s4, s5, s4 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: iszero_or_snan_bf16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0.l -; GFX11SELDAG-TRUE16-NEXT: 
s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s1, s0 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: iszero_or_snan_bf16: ; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s1, 0, v0 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s1, s0 -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_or_b32 v0, v0, v2, v1 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 97) ; 0x60|0x1 = "zero|snan" @@ -2890,22 +2998,26 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX7CHECK-NEXT: v_mul_f32_e32 v0, 1.0, v0 ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fc0 -; GFX7CHECK-NEXT: s_movk_i32 s8, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_cmp_lt_i32_e64 s[4:5], s8, v0 -; GFX7CHECK-NEXT: s_and_b64 s[6:7], s[4:5], vcc -; GFX7CHECK-NEXT: v_add_i32_e64 v1, s[4:5], -1, v0 +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7CHECK-NEXT: v_add_i32_e32 v3, vcc, -1, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s8, v0 -; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v3 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00 -; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f00 +; GFX7CHECK-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_iszero_or_qnan_bf16: @@ -2913,21 +3025,25 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX8CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fc0 -; GFX8CHECK-NEXT: s_movk_i32 s8, 0x7f80 ; 
GFX8CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s8, v0 -; GFX8CHECK-NEXT: s_and_b64 s[6:7], s[4:5], vcc -; GFX8CHECK-NEXT: v_add_u16_e32 v1, -1, v0 +; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_add_u16_e32 v3, -1, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s8, v0 -; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v1 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX8CHECK-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v3 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 -; GFX8CHECK-NEXT: s_movk_i32 s6, 0x7f00 -; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s6, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f00 +; GFX8CHECK-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_iszero_or_qnan_bf16: @@ -2935,75 +3051,87 @@ define i1 @not_iszero_or_qnan_bf16(bfloat %x) { ; GFX9CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fc0 -; GFX9CHECK-NEXT: s_movk_i32 s8, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_gt_i16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: v_cmp_lt_i16_e64 s[4:5], s8, v0 -; GFX9CHECK-NEXT: s_and_b64 s[6:7], s[4:5], vcc -; GFX9CHECK-NEXT: v_add_u16_e32 v1, -1, v0 +; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_add_u16_e32 v3, -1, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s8, v0 -; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v1 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GFX9CHECK-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v3 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 -; GFX9CHECK-NEXT: s_movk_i32 s6, 0x7f00 -; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s6, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f00 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_iszero_or_qnan_bf16: ; GFX10CHECK: ; %bb.0: ; %entry ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX10CHECK-NEXT: v_add_nc_u16 v1, v0, -1 ; GFX10CHECK-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s4, 0x7f80, v0 -; GFX10CHECK-NEXT: v_cmp_eq_u16_e64 s5, 0x7f80, v0 +; 
GFX10CHECK-NEXT: v_add_nc_u16 v2, v0, -1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v0, 0xff80, v0 -; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s6, 0x7f, v1 -; GFX10CHECK-NEXT: s_and_b32 s4, s4, vcc_lo +; GFX10CHECK-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v2 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 -; GFX10CHECK-NEXT: s_or_b32 s5, s6, s5 -; GFX10CHECK-NEXT: s_or_b32 s4, s5, s4 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_qnan_bf16: ; GFX11SELDAG-TRUE16: ; %bb.0: ; %entry ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v0.l, -1 ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v0.l, -1 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.l, 0xff80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s2, 0x7f, v0.h -; GFX11SELDAG-TRUE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo ; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0.l -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s1, s2, s1 -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s1, s0 -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_qnan_bf16: ; GFX11SELDAG-FAKE16: ; %bb.0: ; %entry ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, v0, -1 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_i16_e32 vcc_lo, 0x7fc0, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s0, 0x7f80, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e64 s1, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v2, v0, -1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7f80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 ; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v0, 0xff80, v0 -; GFX11SELDAG-FAKE16-NEXT: 
v_cmp_gt_u16_e64 s2, 0x7f, v1 -; GFX11SELDAG-FAKE16-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v2 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s1, s2, s1 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s1, s0 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_or_b32_e32 v2, v2, v4 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or3_b32 v0, v2, v1, v0 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 925) ; ~(0x60|0x2) = "~(zero|qnan)" @@ -3018,19 +3146,22 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX7CHECK-NEXT: v_bfe_u32 v0, v0, 16, 15 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX7CHECK-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 -; GFX7CHECK-NEXT: v_add_i32_e64 v1, s[4:5], -1, v0 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7CHECK-NEXT: v_add_i32_e32 v2, vcc, -1, v0 ; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX7CHECK-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1 -; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7fbf -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s6, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v2 +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7fbf +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7CHECK-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7CHECK-NEXT: v_add_i32_e32 v0, vcc, 0xffffff80, v0 ; GFX7CHECK-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7CHECK-NEXT: s_movk_i32 s6, 0x7f00 -; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7CHECK-NEXT: s_movk_i32 s4, 0x7f00 +; GFX7CHECK-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7CHECK-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX8CHECK-LABEL: not_iszero_or_snan_bf16: @@ -3039,18 +3170,21 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX8CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX8CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 -; GFX8CHECK-NEXT: v_add_u16_e32 v1, -1, v0 +; GFX8CHECK-NEXT: v_add_u16_e32 v2, -1, v0 ; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX8CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v1 -; GFX8CHECK-NEXT: s_movk_i32 s6, 0x7fbf -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s6, v0 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v2 +; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7fbf +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX8CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 -; GFX8CHECK-NEXT: s_movk_i32 s6, 0x7f00 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s6, v0 -; GFX8CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX8CHECK-NEXT: s_movk_i32 s4, 0x7f00 +; GFX8CHECK-NEXT: v_or_b32_e32 v1, v2, v1 +; 
GFX8CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX8CHECK-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX8CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8CHECK-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX8CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX9CHECK-LABEL: not_iszero_or_snan_bf16: @@ -3059,18 +3193,20 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX9CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f80 ; GFX9CHECK-NEXT: v_cmp_eq_u16_e32 vcc, s4, v0 -; GFX9CHECK-NEXT: v_add_u16_e32 v1, -1, v0 +; GFX9CHECK-NEXT: v_add_u16_e32 v2, -1, v0 ; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f -; GFX9CHECK-NEXT: v_cmp_gt_u16_e64 s[4:5], s4, v1 -; GFX9CHECK-NEXT: s_movk_i32 s6, 0x7fbf -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s6, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v2 +; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7fbf +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_lt_i16_e32 vcc, s4, v0 ; GFX9CHECK-NEXT: v_add_u16_e32 v0, 0xff80, v0 -; GFX9CHECK-NEXT: s_movk_i32 s6, 0x7f00 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s6, v0 -; GFX9CHECK-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX9CHECK-NEXT: s_movk_i32 s4, 0x7f00 +; GFX9CHECK-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX9CHECK-NEXT: v_cmp_gt_u16_e32 vcc, s4, v0 +; GFX9CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9CHECK-NEXT: v_or3_b32 v0, v1, v2, v0 ; GFX9CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX10CHECK-LABEL: not_iszero_or_snan_bf16: @@ -3078,15 +3214,17 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX10CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10CHECK-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX10CHECK-NEXT: v_add_nc_u16 v1, v0, -1 -; GFX10CHECK-NEXT: v_add_nc_u16 v2, 0xff80, v0 ; GFX10CHECK-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 -; GFX10CHECK-NEXT: v_cmp_lt_i16_e64 s5, 0x7fbf, v0 -; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s4, 0x7f, v1 -; GFX10CHECK-NEXT: v_cmp_gt_u16_e64 s6, 0x7f00, v2 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, vcc_lo -; GFX10CHECK-NEXT: s_or_b32 s4, s4, s5 -; GFX10CHECK-NEXT: s_or_b32 s4, s4, s6 -; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX10CHECK-NEXT: v_add_nc_u16 v3, 0xff80, v0 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v1 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 +; GFX10CHECK-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v3 +; GFX10CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX10CHECK-NEXT: v_or3_b32 v0, v1, v0, v2 ; GFX10CHECK-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-TRUE16-LABEL: not_iszero_or_snan_bf16: @@ -3094,15 +3232,17 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX11SELDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-TRUE16-NEXT: v_and_b16 v0.l, 0x7fff, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, v0.l, -1 -; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v1.l, 0xff80, v0.l ; GFX11SELDAG-TRUE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0.l -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v0.h -; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e64 s2, 0x7f00, 
v1.l -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11SELDAG-TRUE16-NEXT: s_or_b32 s0, s0, s2 -; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_add_nc_u16 v0.h, 0xff80, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0.l +; GFX11SELDAG-TRUE16-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v0.h +; GFX11SELDAG-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-TRUE16-NEXT: v_or3_b32 v0, v1, v3, v0 ; GFX11SELDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11SELDAG-FAKE16-LABEL: not_iszero_or_snan_bf16: @@ -3110,15 +3250,17 @@ define i1 @not_iszero_or_snan_bf16(bfloat %x) { ; GFX11SELDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11SELDAG-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v1, v0, -1 -; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v2, 0xff80, v0 ; GFX11SELDAG-FAKE16-NEXT: v_cmp_eq_u16_e32 vcc_lo, 0x7f80, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e64 s1, 0x7fbf, v0 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s0, 0x7f, v1 -; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e64 s2, 0x7f00, v2 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, vcc_lo -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s1 -; GFX11SELDAG-FAKE16-NEXT: s_or_b32 s0, s0, s2 -; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11SELDAG-FAKE16-NEXT: v_add_nc_u16 v3, 0xff80, v0 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f, v1 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_lt_i16_e32 vcc_lo, 0x7fbf, v0 +; GFX11SELDAG-FAKE16-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_cmp_gt_u16_e32 vcc_lo, 0x7f00, v3 +; GFX11SELDAG-FAKE16-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11SELDAG-FAKE16-NEXT: v_or3_b32 v0, v1, v0, v2 ; GFX11SELDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] entry: %0 = tail call i1 @llvm.is.fpclass.bf16(bfloat %x, i32 926) ; ~(0x60|0x1) = "~(zero|snan)" diff --git a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll index 18c462ffd0ff5..761558443bebd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.is.fpclass.f16.ll @@ -172,9 +172,10 @@ define i1 @snan_f16(half %x) nounwind { ; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s5, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: snan_f16: @@ -462,15 +463,16 @@ define i1 @posnormal_f16(half %x) nounwind { ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800 +; 
GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7800 ; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1 -; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: posnormal_f16: @@ -547,15 +549,14 @@ define i1 @negnormal_f16(half %x) nounwind { ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800 -; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7800 +; GFX7SELDAG-NEXT: v_lshrrev_b32_e32 v1, 15, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 -; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], vcc, s[4:5] -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: negnormal_f16: @@ -706,14 +707,13 @@ define i1 @negsubnormal_f16(half %x) nounwind { ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 -; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_add_i32_e64 v0, s[4:5], -1, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_lshrrev_b32_e32 v1, 15, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, -1, v0 +; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: negsubnormal_f16: @@ -1002,12 +1002,11 @@ define i1 @negfinite_f16(half %x) nounwind { ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7SELDAG-NEXT: v_lshrrev_b32_e32 v1, 15, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: negfinite_f16: @@ -2269,15 +2268,14 @@ define i1 @not_is_plus_normal_f16(half %x) { ; GFX7SELDAG: ; %bb.0: ; 
GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff -; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x77ff +; GFX7SELDAG-NEXT: v_lshrrev_b32_e32 v1, 15, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], 0, v1 -; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_is_plus_normal_f16: @@ -2363,15 +2361,16 @@ define i1 @not_is_neg_normal_f16(half %x) { ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x77ff +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x77ff ; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v1 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v1 -; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s6, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_is_neg_normal_f16: @@ -2851,18 +2850,19 @@ define i1 @not_ispositive_f16(half %x) { ; GFX7SELDAG: ; %bb.0: ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7c00 -; GFX7SELDAG-NEXT: s_mov_b32 s7, 0xfc00 -; GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: s_mov_b32 s5, 0xfc00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s6, v2 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s7, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v2 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v2 +; GFX7SELDAG-NEXT: v_lshrrev_b32_e32 v1, 15, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s5, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v1, v3, v1 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v2 +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_ispositive_f16: @@ -2941,15 +2941,15 @@ define i1 @isnegative_f16(half %x) { ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7SELDAG-NEXT: s_mov_b32 s6, 0xfc00 -; 
GFX7SELDAG-NEXT: v_bfe_i32 v1, v0, 0, 16 ; GFX7SELDAG-NEXT: v_and_b32_e32 v2, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX7SELDAG-NEXT: v_cmp_gt_i32_e64 s[4:5], s4, v2 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v2 +; GFX7SELDAG-NEXT: s_mov_b32 s4, 0xfc00 +; GFX7SELDAG-NEXT: v_lshrrev_b32_e32 v1, 15, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: isnegative_f16: @@ -3026,12 +3026,13 @@ define i1 @not_isnegative_f16(half %x) { ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 -; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00 ; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_isnegative_f16: @@ -3110,9 +3111,10 @@ define i1 @iszero_or_nan_f16(half %x) { ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: iszero_or_nan_f16: @@ -3189,9 +3191,10 @@ define i1 @iszero_or_nan_f_daz(half %x) #0 { ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: iszero_or_nan_f_daz: @@ -3268,9 +3271,10 @@ define i1 @iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; 
GFX7GLISEL-LABEL: iszero_or_nan_f_maybe_daz: @@ -3347,9 +3351,10 @@ define i1 @not_iszero_or_nan_f16(half %x) { ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_iszero_or_nan_f16: @@ -3435,9 +3440,10 @@ define i1 @not_iszero_or_nan_f_daz(half %x) #0 { ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_iszero_or_nan_f_daz: @@ -3523,9 +3529,10 @@ define i1 @not_iszero_or_nan_f_maybe_daz(half %x) #1 { ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c01 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_iszero_or_nan_f_maybe_daz: @@ -3611,9 +3618,10 @@ define i1 @iszero_or_qnan_f16(half %x) { ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: iszero_or_qnan_f16: @@ -3691,11 +3699,13 @@ define i1 @iszero_or_snan_f16(half %x) { ; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s5, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[4:5], s[4:5], vcc +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s5, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: iszero_or_snan_f16: @@ -3772,23 +3782,27 @@ define i1 @not_iszero_or_qnan_f16(half %x) { ; GFX7SELDAG-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7e00 -; GFX7SELDAG-NEXT: s_movk_i32 s8, 0x7c00 +; GFX7SELDAG-NEXT: s_movk_i32 s5, 0x7c00 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_gt_i32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e64 s[4:5], s8, v0 -; GFX7SELDAG-NEXT: s_and_b64 s[6:7], s[4:5], vcc -; GFX7SELDAG-NEXT: v_add_i32_e64 v1, s[4:5], -1, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s5, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s5, v0 +; GFX7SELDAG-NEXT: v_and_b32_e32 v1, v2, v1 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7SELDAG-NEXT: v_add_i32_e32 v3, vcc, -1, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff -; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s8, v0 -; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v3 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800 -; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7800 +; GFX7SELDAG-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_iszero_or_qnan_f16: @@ -3876,21 +3890,24 @@ define i1 @not_iszero_or_snan_f16(half %x) { ; GFX7SELDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX7SELDAG-NEXT: v_cvt_f16_f32_e32 v0, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7c00 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7dff ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0x7fff, v0 ; GFX7SELDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v0 -; GFX7SELDAG-NEXT: v_add_i32_e64 v1, s[4:5], -1, v0 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX7SELDAG-NEXT: v_add_i32_e32 v2, vcc, -1, v0 ; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x3ff -; GFX7SELDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], s4, v1 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s6, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc +; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v2 +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7dff +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX7SELDAG-NEXT: v_cmp_lt_i32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX7SELDAG-NEXT: v_add_i32_e32 v0, vcc, 0xfffffc00, v0 ; GFX7SELDAG-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX7SELDAG-NEXT: s_movk_i32 s6, 0x7800 -; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s6, v0 -; GFX7SELDAG-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GFX7SELDAG-NEXT: s_movk_i32 s4, 0x7800 +; GFX7SELDAG-NEXT: v_cmp_gt_u32_e32 vcc, s4, v0 +; GFX7SELDAG-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX7SELDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX7SELDAG-NEXT: v_or_b32_e32 v0, v1, v0 ; GFX7SELDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX7GLISEL-LABEL: not_iszero_or_snan_f16: diff --git a/llvm/test/CodeGen/AMDGPU/loop_break.ll 
b/llvm/test/CodeGen/AMDGPU/loop_break.ll index fcae73c763682..72fab4b91502e 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_break.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_break.ll @@ -500,10 +500,12 @@ define amdgpu_kernel void @invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] ; GCN-NEXT: .LBB5_3: ; %Flow ; GCN-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-NEXT: s_xor_b64 s[8:9], s[4:5], -1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GCN-NEXT: v_not_b32_e32 v1, v1 +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: s_add_i32 s6, s6, 1 -; GCN-NEXT: s_and_b64 s[8:9], exec, s[8:9] -; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[0:1] +; GCN-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GCN-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GCN-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-NEXT: ; %bb.4: ; %bb9 diff --git a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll index e37dcf60506be..a8792e9a4a3aa 100644 --- a/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll +++ b/llvm/test/CodeGen/AMDGPU/loop_exit_with_xor.ll @@ -99,8 +99,9 @@ define void @break_cond_is_arg(i32 %arg, i1 %breakcond) { ; GCN-LABEL: break_cond_is_arg: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-NEXT: v_not_b32_e32 v1, v1 ; GCN-NEXT: v_and_b32_e32 v1, 1, v1 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v1 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GCN-NEXT: s_mov_b32 s10, 1 ; GCN-NEXT: s_mov_b64 s[6:7], 0 ; GCN-NEXT: s_branch .LBB2_2 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll index 34a9624cb19eb..f83201fbffced 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-loop-var-out-of-divergent-loop-swdev407790.ll @@ -8,11 +8,12 @@ define void @machinesink_loop_variable_out_of_divergent_loop(i32 %arg, i1 %cmp49 ; CHECK-LABEL: machinesink_loop_variable_out_of_divergent_loop: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_and_b32_e32 v1, 1, v1 +; CHECK-NEXT: v_not_b32_e32 v1, v1 ; CHECK-NEXT: v_and_b32_e32 v3, 1, v3 ; CHECK-NEXT: s_mov_b32 s6, 0 -; CHECK-NEXT: v_cmp_ne_u32_e64 s4, 1, v1 +; CHECK-NEXT: v_and_b32_e32 v1, 1, v1 ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v3 +; CHECK-NEXT: v_cmp_eq_u32_e64 s4, 1, v1 ; CHECK-NEXT: s_inst_prefetch 0x1 ; CHECK-NEXT: s_branch .LBB0_3 ; CHECK-NEXT: .p2align 6 diff --git a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll index 553d7e09390fd..d05b2e770ba9b 100644 --- a/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll +++ b/llvm/test/CodeGen/AMDGPU/machine-sink-temporal-divergence-swdev407790.ll @@ -338,13 +338,16 @@ define protected amdgpu_kernel void @kernel_round1(ptr addrspace(1) nocapture no ; CHECK-NEXT: ; in Loop: Header=BB0_5 Depth=1 ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s55 ; CHECK-NEXT: ; %bb.24: ; in Loop: Header=BB0_5 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45 -; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v47 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, 59, v47 ; CHECK-NEXT: v_add_nc_u32_e32 v46, 1, v46 ; CHECK-NEXT: s_mov_b32 s55, s54 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s53, s4, s53 +; CHECK-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s54, v45 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; CHECK-NEXT: v_or_b32_e32 v0, v1, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; CHECK-NEXT: s_or_b32 s53, vcc_lo, s53 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s53 ; CHECK-NEXT: s_cbranch_execnz .LBB0_5 ; CHECK-NEXT: .LBB0_25: ; %Flow51 @@ -966,12 +969,15 @@ define protected amdgpu_kernel void @kernel_round1_short(ptr addrspace(1) nocapt ; CHECK-NEXT: s_or_b32 exec_lo, exec_lo, s54 ; CHECK-NEXT: ; %bb.12: ; %.32 ; CHECK-NEXT: ; in Loop: Header=BB1_1 Depth=1 -; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45 -; CHECK-NEXT: v_cmp_lt_u32_e64 s4, 59, v43 -; CHECK-NEXT: s_or_b32 s4, vcc_lo, s4 -; CHECK-NEXT: s_and_b32 s4, exec_lo, s4 -; CHECK-NEXT: s_or_b32 s52, s4, s52 +; CHECK-NEXT: v_cmp_lt_u32_e32 vcc_lo, 59, v43 ; CHECK-NEXT: s_mov_b32 s4, s53 +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; CHECK-NEXT: v_cmp_ge_u32_e32 vcc_lo, s53, v45 +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; CHECK-NEXT: v_or_b32_e32 v0, v1, v0 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; CHECK-NEXT: s_or_b32 s52, vcc_lo, s52 ; CHECK-NEXT: s_andn2_b32 exec_lo, exec_lo, s52 ; CHECK-NEXT: s_cbranch_execnz .LBB1_1 ; CHECK-NEXT: ; %bb.13: ; %.119 diff --git a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll index 0f67a404972aa..012632ccf4b6b 100644 --- a/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll +++ b/llvm/test/CodeGen/AMDGPU/mixed-vmem-types.ll @@ -8,35 +8,40 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX11: ; %bb.0: ; %.entry ; GFX11-NEXT: s_getpc_b64 s[4:5] ; GFX11-NEXT: s_mov_b32 s0, s3 -; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s1, s5 -; GFX11-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 +; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX11-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: buffer_load_b32 v1, off, s[20:23], 0 -; GFX11-NEXT: buffer_load_b32 v2, off, s[16:19], 0 -; GFX11-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-NEXT: buffer_load_b32 v0, off, s[16:19], 0 +; GFX11-NEXT: buffer_load_b32 v2, off, s[20:23], 0 ; GFX11-NEXT: buffer_load_b32 v4, off, s[40:43], 0 -; GFX11-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 -; GFX11-NEXT: s_waitcnt vmcnt(4) -; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1 -; GFX11-NEXT: s_waitcnt vmcnt(3) -; GFX11-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) -; GFX11-NEXT: v_cmp_eq_f32_e64 s1, 1.0, v3 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_waitcnt vmcnt(1) +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, 0xbc00bc00 :: v_dual_and_b32 v0, v0, v2 +; GFX11-NEXT: image_sample_lz v3, v1, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX11-NEXT: image_sample_lz v1, v1, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 -; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX11-NEXT: 
v_cmp_eq_f32_e32 vcc_lo, 1.0, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX11-NEXT: s_and_b32 s0, s0, s1 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 s0, s0, s2 -; GFX11-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 +; GFX11-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: buffer_store_b32 v0, off, s[24:27], 0 ; GFX11-NEXT: s_endpgm ; @@ -45,35 +50,46 @@ define amdgpu_cs void @mixed_vmem_types(i32 inreg %globalTable, i32 inreg %perSh ; GFX12-NEXT: s_getpc_b64 s[4:5] ; GFX12-NEXT: s_mov_b32 s0, s3 ; GFX12-NEXT: s_sext_i32_i16 s5, s5 -; GFX12-NEXT: v_mov_b32_e32 v0, 0xbc00bc00 -; GFX12-NEXT: s_mov_b32 s3, s5 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX12-NEXT: s_mov_b32 s1, s5 -; GFX12-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 +; GFX12-NEXT: s_mov_b32 s3, s5 ; GFX12-NEXT: s_load_b512 s[4:19], s[0:1], 0x0 +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[20:27], s[2:3], 0x40 ; GFX12-NEXT: s_load_b512 s[36:51], s[2:3], 0x0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: buffer_load_b32 v1, off, s[20:23], null -; GFX12-NEXT: buffer_load_b32 v2, off, s[16:19], null -; GFX12-NEXT: image_sample_lz v3, v0, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-NEXT: buffer_load_b32 v0, off, s[16:19], null +; GFX12-NEXT: buffer_load_b32 v2, off, s[20:23], null ; GFX12-NEXT: buffer_load_b32 v4, off, s[40:43], null -; GFX12-NEXT: image_sample_lz v0, v0, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-NEXT: s_wait_loadcnt 0x2 -; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v1 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v0 +; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX12-NEXT: s_wait_loadcnt 0x1 -; GFX12-NEXT: v_cmp_eq_u32_e64 s0, 0xac0, v2 +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_dual_mov_b32 v1, 0xbc00bc00 :: v_dual_and_b32 v0, v0, v2 +; GFX12-NEXT: image_sample_lz v3, v1, s[8:15], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 +; GFX12-NEXT: image_sample_lz v1, v1, s[44:51], s[36:39] dmask:0x1 dim:SQ_RSRC_IMG_2D a16 ; GFX12-NEXT: s_wait_samplecnt 0x1 -; GFX12-NEXT: v_cmp_eq_f32_e64 s1, 1.0, v3 +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 1.0, v3 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc_lo ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_eq_u32_e64 s2, 0xac0, v4 -; GFX12-NEXT: s_and_b32 s0, s0, vcc_lo +; GFX12-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0xac0, v4 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_4) | instid1(VALU_DEP_2) +; GFX12-NEXT: v_and_b32_e32 v0, v0, v3 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX12-NEXT: s_wait_samplecnt 0x0 -; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0 -; GFX12-NEXT: s_and_b32 
s0, s0, s1 -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: s_and_b32 s0, s0, s2 -; GFX12-NEXT: s_and_b32 s0, s0, vcc_lo -; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX12-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX12-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v1 +; GFX12-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX12-NEXT: s_wait_alu 0xfffd +; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX12-NEXT: buffer_store_b32 v0, off, s[24:27], null ; GFX12-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll index 803cae4a7f9cd..7b509c38b285c 100644 --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -57,7 +57,7 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: v_and_b32_e32 v5, 1, v18 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v0, v1 -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] ; GFX9-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-NEXT: ; %bb.1: ; %bb19 ; GFX9-NEXT: v_cvt_f32_u32_e32 v7, v6 @@ -67,41 +67,46 @@ define void @lsr_order_mul24_1(i32 %arg, i32 %arg1, i32 %arg2, ptr addrspace(3) ; GFX9-NEXT: v_rcp_iflag_f32_e32 v7, v7 ; GFX9-NEXT: v_lshlrev_b32_e32 v8, 2, v2 ; GFX9-NEXT: v_add_u32_e32 v9, v17, v12 -; GFX9-NEXT: s_mov_b64 s[10:11], 0 +; GFX9-NEXT: s_mov_b64 s[8:9], 0 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: ; implicit-def: $vgpr3 ; GFX9-NEXT: .LBB1_2: ; %bb23 ; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX9-NEXT: v_cvt_f32_u32_e32 v3, v0 -; GFX9-NEXT: v_add_u32_e32 v12, v17, v0 ; GFX9-NEXT: v_madak_f32 v3, v3, v7, 0x3727c5ac ; GFX9-NEXT: v_cvt_u32_f32_e32 v3, v3 ; GFX9-NEXT: v_mul_u32_u24_e32 v18, v3, v5 ; GFX9-NEXT: v_add_u32_e32 v19, v3, v16 +; GFX9-NEXT: v_add_u32_e32 v3, v17, v0 +; GFX9-NEXT: v_sub_u32_e32 v20, v3, v18 ; GFX9-NEXT: v_add_u32_e32 v3, v9, v0 ; GFX9-NEXT: v_sub_u32_e32 v3, v3, v18 -; GFX9-NEXT: v_sub_u32_e32 v12, v12, v18 -; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v19, v13 -; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v19, v15, v[3:4] -; GFX9-NEXT: v_cmp_lt_u32_e64 s[6:7], v12, v14 -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GFX9-NEXT: s_and_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v18, s[4:5] +; GFX9-NEXT: v_cmp_lt_u32_e32 vcc, v19, v13 +; GFX9-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v19, v15, v[3:4] +; GFX9-NEXT: v_cmp_lt_u32_e64 s[4:5], v20, v14 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v19, 0, 1, s[4:5] +; GFX9-NEXT: v_and_b32_e32 v3, v3, v19 +; GFX9-NEXT: v_and_b32_e32 v3, v3, v12 +; GFX9-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v18, vcc ; GFX9-NEXT: v_lshlrev_b64 v[18:19], 2, v[3:4] ; GFX9-NEXT: v_add_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_add_co_u32_e64 v18, s[6:7], v10, v18 -; GFX9-NEXT: v_addc_co_u32_e64 v19, s[6:7], v11, v19, s[6:7] +; GFX9-NEXT: v_add_co_u32_e64 v18, s[4:5], v10, v18 +; GFX9-NEXT: v_addc_co_u32_e64 v19, s[4:5], v11, v19, s[4:5] ; GFX9-NEXT: global_load_dword v3, v[18:19], off -; GFX9-NEXT: v_cmp_ge_u32_e64 s[6:7], v0, v1 -; GFX9-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] +; GFX9-NEXT: 
v_cmp_ge_u32_e64 s[4:5], v0, v1 +; GFX9-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, v3, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc ; GFX9-NEXT: ds_write_b32 v6, v3 ; GFX9-NEXT: v_add_u32_e32 v6, v6, v8 -; GFX9-NEXT: s_andn2_b64 exec, exec, s[10:11] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[8:9] ; GFX9-NEXT: s_cbranch_execnz .LBB1_2 ; GFX9-NEXT: .LBB1_3: ; %Flow2 -; GFX9-NEXT: s_or_b64 exec, exec, s[8:9] +; GFX9-NEXT: s_or_b64 exec, exec, s[6:7] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll index 420539346b400..9c97cbb6a0b1c 100644 --- a/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll +++ b/llvm/test/CodeGen/AMDGPU/nor-divergent-lanemask.ll @@ -9,9 +9,13 @@ define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) { ; SDAG-W64-LABEL: test_nor: ; SDAG-W64: ; %bb.0: -; SDAG-W64-NEXT: s_nor_b64 s[0:1], s[0:1], s[2:3] -; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SDAG-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-W64-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-W64-NEXT: v_not_b32_e32 v0, v0 +; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-W64-NEXT: v_and_b32_e32 v0, 1, v0 ; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v0 ; SDAG-W64-NEXT: ; return to shader part epilog ; @@ -24,10 +28,14 @@ define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) { ; ; SDAG-W32-LABEL: test_nor: ; SDAG-W32: ; %bb.0: -; SDAG-W32-NEXT: s_nor_b32 s0, s0, s2 -; SDAG-W32-NEXT: s_mov_b32 s1, 0 ; SDAG-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) +; SDAG-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; SDAG-W32-NEXT: s_mov_b32 s1, 0 +; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-W32-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-W32-NEXT: v_not_b32_e32 v0, v0 +; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-W32-NEXT: v_and_b32_e32 v0, 1, v0 ; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; SDAG-W32-NEXT: ; return to shader part epilog ; @@ -48,17 +56,18 @@ define amdgpu_ps i64 @test_nor(i64 inreg %a, i64 inreg %b) { define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) { ; SDAG-W64-LABEL: test_or_two_uses: ; SDAG-W64: ; %bb.0: -; SDAG-W64-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; SDAG-W64-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_2) ; SDAG-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SDAG-W64-NEXT: s_xor_b64 s[0:1], s[0:1], -1 -; SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe -; SDAG-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SDAG-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; SDAG-W64-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-W64-NEXT: v_not_b32_e32 v1, v0 +; SDAG-W64-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; SDAG-W64-NEXT: v_and_b32_e32 v1, 1, v1 ; SDAG-W64-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; SDAG-W64-NEXT: s_delay_alu instid0(VALU_DEP_2) ; SDAG-W64-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v1 ; SDAG-W64-NEXT: s_and_b64 s[0:1], s[0:1], vcc -; 
SDAG-W64-NEXT: s_waitcnt_depctr 0xfffe ; SDAG-W64-NEXT: ; return to shader part epilog ; ; GISEL-W64-LABEL: test_or_two_uses: @@ -73,12 +82,16 @@ define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) { ; ; SDAG-W32-LABEL: test_or_two_uses: ; SDAG-W32: ; %bb.0: -; SDAG-W32-NEXT: s_or_b32 s0, s0, s2 -; SDAG-W32-NEXT: s_mov_b32 s3, 0 ; SDAG-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; SDAG-W32-NEXT: s_xor_b32 s0, s0, -1 +; SDAG-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s2 +; SDAG-W32-NEXT: s_mov_b32 s3, 0 +; SDAG-W32-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; SDAG-W32-NEXT: s_mov_b32 s1, s3 -; SDAG-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; SDAG-W32-NEXT: v_or_b32_e32 v0, v0, v1 +; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; SDAG-W32-NEXT: v_not_b32_e32 v1, v0 +; SDAG-W32-NEXT: v_and_b32_e32 v0, 1, v0 +; SDAG-W32-NEXT: v_and_b32_e32 v1, 1, v1 ; SDAG-W32-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s0, 0, v0 ; SDAG-W32-NEXT: v_cmp_ne_u32_e64 s2, 0, v1 @@ -104,4 +117,4 @@ define amdgpu_ps i64 @test_or_two_uses(i64 inreg %a, i64 inreg %b) { %r1 = call i64 @llvm.amdgcn.ballot.i64(i1 %or) %r = and i64 %r0, %r1 ret i64 %r -} \ No newline at end of file +} diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index cc9650b9a7309..f07195d56fca3 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -1253,8 +1253,10 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX6-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX6-NEXT: s_cmp_eq_u32 s2, s3 ; GFX6-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX6-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -1269,8 +1271,10 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX8-NEXT: s_cselect_b64 s[0:1], -1, 0 ; GFX8-NEXT: s_cmp_eq_u32 s2, s3 ; GFX8-NEXT: s_cselect_b64 s[2:3], -1, 0 -; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX8-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll index 120aebf2bf7c8..af8bc3e6f1622 100644 --- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll +++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll @@ -1508,59 +1508,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v4, v[2:3], off ; GFX10-NEXT: global_load_dword v9, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: 
v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 ; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v16, v10 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 -; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v14 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v15, v1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; GFX10-NEXT: v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v12 ; GFX10-NEXT: v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3 -; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 -; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 -; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 +; GFX10-NEXT: v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2 ; GFX10-NEXT: v_ashrrev_i32_e32 v3, 30, v3 +; GFX10-NEXT: v_mul_f32_e32 v16, v19, v16 +; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18 +; GFX10-NEXT: v_mul_f32_e32 v15, v2, v15 +; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX10-NEXT: v_or_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_trunc_f32_e32 v16, v16 +; GFX10-NEXT: v_trunc_f32_e32 v18, v18 ; GFX10-NEXT: v_mul_f32_e32 v17, v2, v17 -; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_trunc_f32_e32 v15, v15 -; GFX10-NEXT: v_trunc_f32_e32 v16, v16 -; GFX10-NEXT: v_mul_f32_e32 v18, v1, v18 +; GFX10-NEXT: v_ashrrev_i32_e32 v0, 30, v0 +; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19 +; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1 +; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 ; GFX10-NEXT: v_trunc_f32_e32 v17, v17 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 ; GFX10-NEXT: v_mad_f32 v20, -v15, v1, v2 -; GFX10-NEXT: v_mad_f32 v19, -v16, v10, v19 -; GFX10-NEXT: v_or_b32_e32 v3, 1, v3 -; GFX10-NEXT: v_trunc_f32_e32 v18, v18 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| +; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 +; GFX10-NEXT: v_or_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_mad_f32 v2, -v17, v12, v2 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| -; GFX10-NEXT: v_ashrrev_i32_e32 v13, 30, v13 +; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v3, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| ; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 -; GFX10-NEXT: v_mad_f32 v21, -v18, v14, v1 +; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 ; GFX10-NEXT: v_cvt_i32_f32_e32 v15, v15 -; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10| -; GFX10-NEXT: v_or_b32_e32 v13, 1, v13 -; GFX10-NEXT: v_cvt_i32_f32_e32 v16, v16 ; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, 
v3, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v10, 0, v13, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1| +; GFX10-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc_lo ; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12| -; GFX10-NEXT: v_add_nc_u32_e32 v0, v15, v0 -; GFX10-NEXT: v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14| -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_add_nc_u32_e32 v2, v17, v2 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v13, vcc_lo -; GFX10-NEXT: v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16 v2, v16, v3 +; GFX10-NEXT: v_add_nc_u16 v3, v18, v10 +; GFX10-NEXT: v_add_nc_u16 v0, v15, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v11, vcc_lo +; GFX10-NEXT: v_lshlrev_b16 v2, 8, v2 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-NEXT: v_add_nc_u16 v1, v17, v1 +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x60706 ; GFX10-NEXT: global_store_dword v[5:6], v0, off @@ -1628,10 +1630,10 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: v_cndmask_b32_e32 v3, 0, v9, vcc ; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, |v4| ; GFX9-NEXT: v_cndmask_b32_e32 v2, 0, v14, vcc -; GFX9-NEXT: v_add_u32_e32 v1, v15, v1 -; GFX9-NEXT: v_add_u32_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_add_u32_e32 v3, v17, v3 -; GFX9-NEXT: v_add_u32_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v1, v15, v1 +; GFX9-NEXT: v_add_u16_sdwa v4, v16, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v3, v17, v3 +; GFX9-NEXT: v_add_u16_sdwa v2, v18, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -1854,79 +1856,83 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_and_b32_e32 v4, 0x3ff, v31 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 2, v4 -; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX10-NEXT: global_load_dword v4, v[2:3], off -; GFX10-NEXT: global_load_dword v9, v[0:1], off +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo +; GFX10-NEXT: global_load_dword v4, v[0:1], off +; GFX10-NEXT: global_load_dword v9, v[2:3], off ; GFX10-NEXT: s_waitcnt 
vmcnt(1) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v17, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v18, v13 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_cvt_f32_i32_sdwa v14, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v16, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v10, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_cvt_f32_i32_sdwa v23, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v14 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v22, v16 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v19, v3 -; GFX10-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v20, v15 -; GFX10-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX10-NEXT: v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX10-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 -; GFX10-NEXT: v_ashrrev_i32_e32 v1, 30, v1 -; GFX10-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 -; GFX10-NEXT: v_mul_f32_e32 v17, v3, v17 -; GFX10-NEXT: v_mul_f32_e32 v18, v12, v18 -; GFX10-NEXT: v_mul_f32_e32 v19, v15, v19 -; GFX10-NEXT: v_ashrrev_i32_e32 v11, 30, v11 -; GFX10-NEXT: v_or_b32_e32 v1, 1, v1 -; GFX10-NEXT: v_trunc_f32_e32 v17, v17 -; GFX10-NEXT: v_trunc_f32_e32 v18, v18 -; GFX10-NEXT: v_mul_f32_e32 v20, v21, v20 -; GFX10-NEXT: v_trunc_f32_e32 v19, v19 -; GFX10-NEXT: v_ashrrev_i32_e32 v14, 30, v14 -; GFX10-NEXT: v_mad_f32 v22, -v17, v2, v3 -; GFX10-NEXT: v_mad_f32 v12, -v18, v13, v12 -; GFX10-NEXT: v_or_b32_e32 v11, 1, v11 +; GFX10-NEXT: v_xor_b32_sdwa v12, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v21, v10 +; GFX10-NEXT: v_xor_b32_sdwa v18, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX10-NEXT: v_xor_b32_sdwa v2, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX10-NEXT: v_xor_b32_sdwa v15, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX10-NEXT: v_ashrrev_i32_e32 v12, 30, v12 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 8, v9 +; GFX10-NEXT: v_mul_f32_e32 v20, v13, v20 +; GFX10-NEXT: v_mul_f32_e32 v22, v23, v22 +; GFX10-NEXT: v_mul_f32_e32 v19, v10, v19 +; GFX10-NEXT: v_ashrrev_i32_e32 v18, 30, v18 +; GFX10-NEXT: v_or_b32_e32 v12, 1, v12 ; GFX10-NEXT: v_trunc_f32_e32 v20, v20 -; GFX10-NEXT: v_mad_f32 v23, -v19, v3, v15 -; GFX10-NEXT: 
v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2| -; GFX10-NEXT: v_ashrrev_i32_e32 v16, 30, v16 -; GFX10-NEXT: v_or_b32_e32 v14, 1, v14 -; GFX10-NEXT: v_mad_f32 v21, -v20, v15, v21 -; GFX10-NEXT: v_cvt_i32_f32_e32 v17, v17 -; GFX10-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13| -; GFX10-NEXT: v_or_b32_e32 v16, 1, v16 -; GFX10-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 +; GFX10-NEXT: v_trunc_f32_e32 v22, v22 +; GFX10-NEXT: v_mul_f32_e32 v21, v16, v21 +; GFX10-NEXT: v_trunc_f32_e32 v19, v19 +; GFX10-NEXT: v_ashrrev_i32_e32 v2, 30, v2 +; GFX10-NEXT: v_mad_f32 v13, -v20, v14, v13 +; GFX10-NEXT: v_mad_f32 v23, -v22, v16, v23 +; GFX10-NEXT: v_or_b32_e32 v18, 1, v18 +; GFX10-NEXT: v_trunc_f32_e32 v21, v21 +; GFX10-NEXT: v_mad_f32 v24, -v19, v3, v10 +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v13|, |v14| +; GFX10-NEXT: v_ashrrev_i32_e32 v15, 30, v15 +; GFX10-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX10-NEXT: v_mad_f32 v25, -v21, v10, v16 ; GFX10-NEXT: v_cvt_i32_f32_e32 v20, v20 -; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3| +; GFX10-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v23|, |v16| +; GFX10-NEXT: v_or_b32_e32 v15, 1, v15 +; GFX10-NEXT: v_cvt_i32_f32_e32 v22, v22 +; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v9 +; GFX10-NEXT: v_cvt_i32_f32_e32 v19, v19 +; GFX10-NEXT: v_cndmask_b32_e32 v13, 0, v18, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v24|, |v3| +; GFX10-NEXT: v_cvt_i32_f32_e32 v21, v21 ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v10, 8, v4 -; GFX10-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v1, v17, v1 -; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v14, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15| -; GFX10-NEXT: v_add_nc_u32_e32 v2, v18, v2 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, v4 -; GFX10-NEXT: v_add_nc_u32_e32 v3, v19, v3 -; GFX10-NEXT: v_cndmask_b32_e32 v11, 0, v16, vcc_lo -; GFX10-NEXT: v_mul_lo_u32 v2, v2, v10 -; GFX10-NEXT: v_mul_lo_u32 v3, v3, v0 -; GFX10-NEXT: v_add_nc_u32_e32 v11, v20, v11 -; GFX10-NEXT: v_sub_nc_u32_e32 v0, v0, v1 -; GFX10-NEXT: v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX10-NEXT: v_mul_lo_u32 v10, v11, v12 -; GFX10-NEXT: v_sub_nc_u32_e32 v2, v12, v3 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v1, v4, v9, 0x2070306 +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 16, v9 +; GFX10-NEXT: v_cndmask_b32_e32 v2, 0, v2, vcc_lo +; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v25|, |v10| +; GFX10-NEXT: v_add_nc_u16 v10, v20, v12 +; GFX10-NEXT: v_add_nc_u16 v12, v22, v13 +; GFX10-NEXT: v_lshrrev_b32_e32 v13, 24, v4 +; GFX10-NEXT: v_add_nc_u16 v2, v19, v2 +; GFX10-NEXT: v_cndmask_b32_e32 v3, 0, v15, vcc_lo +; GFX10-NEXT: v_mul_lo_u16 v10, v10, v11 +; GFX10-NEXT: v_mul_lo_u16 v11, v12, v17 +; GFX10-NEXT: v_mul_lo_u16 v2, v2, v9 +; GFX10-NEXT: v_add_nc_u16 v3, v21, v3 +; GFX10-NEXT: v_sub_nc_u16 v10, v13, v10 +; GFX10-NEXT: v_sub_nc_u16 v0, v0, v11 +; GFX10-NEXT: v_mul_lo_u16 v3, v3, v1 +; GFX10-NEXT: 
v_sub_nc_u16 v1, v1, v2 +; GFX10-NEXT: v_lshlrev_b16 v0, 8, v0 +; GFX10-NEXT: v_sub_nc_u16 v2, v17, v3 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v10 +; GFX10-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX10-NEXT: v_perm_b32 v1, v9, v4, 0x2070306 ; GFX10-NEXT: global_store_dword v[5:6], v0, off ; GFX10-NEXT: global_store_dword v[7:8], v1, off ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -1944,74 +1950,72 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v9, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x2070306 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v10 -; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 -; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v13 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v19, v2 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 +; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 +; GFX9-NEXT: v_mul_f32_e32 v18, v12, v18 ; GFX9-NEXT: v_trunc_f32_e32 v18, v18 -; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, |v14| -; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v3 -; GFX9-NEXT: v_mul_f32_e32 v14, v16, v19 -; GFX9-NEXT: v_trunc_f32_e32 v14, v14 -; GFX9-NEXT: v_mad_f32 v19, -v14, v10, v16 -; GFX9-NEXT: v_mul_f32_e32 v13, v10, v13 +; GFX9-NEXT: v_mad_f32 v12, -v18, v13, v12 +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v12|, |v13| +; GFX9-NEXT: v_mul_f32_e32 v13, v3, v19 ; GFX9-NEXT: v_trunc_f32_e32 v13, v13 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v19|, |v10| -; GFX9-NEXT: v_mad_f32 v10, -v13, v3, v10 -; GFX9-NEXT: v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 -; GFX9-NEXT: v_cmp_ge_f32_e64 s[6:7], |v10|, |v3| -; GFX9-NEXT: v_rcp_iflag_f32_e32 v3, v16 -; GFX9-NEXT: v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 -; GFX9-NEXT: v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 -; GFX9-NEXT: v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 -; GFX9-NEXT: v_mul_f32_e32 v3, v19, v3 -; GFX9-NEXT: v_trunc_f32_e32 v3, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v12, 30, v12 -; GFX9-NEXT: 
v_xor_b32_sdwa v10, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX9-NEXT: v_mad_f32 v19, -v13, v2, v3 +; GFX9-NEXT: v_cvt_f32_i32_sdwa v17, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v19|, |v2| +; GFX9-NEXT: v_rcp_iflag_f32_e32 v2, v15 +; GFX9-NEXT: v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1 +; GFX9-NEXT: v_mul_f32_e32 v12, v15, v12 +; GFX9-NEXT: v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2 +; GFX9-NEXT: v_mul_f32_e32 v2, v17, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v11, 30, v11 +; GFX9-NEXT: v_trunc_f32_e32 v12, v12 +; GFX9-NEXT: v_trunc_f32_e32 v2, v2 +; GFX9-NEXT: v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0 +; GFX9-NEXT: v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3 +; GFX9-NEXT: v_ashrrev_i32_e32 v14, 30, v14 +; GFX9-NEXT: v_or_b32_e32 v11, 1, v11 ; GFX9-NEXT: v_cvt_i32_f32_e32 v13, v13 ; GFX9-NEXT: v_cvt_i32_f32_e32 v18, v18 -; GFX9-NEXT: v_cvt_i32_f32_e32 v14, v14 -; GFX9-NEXT: v_mad_f32 v19, -v3, v16, v19 -; GFX9-NEXT: v_cvt_i32_f32_e32 v3, v3 -; GFX9-NEXT: v_ashrrev_i32_e32 v15, 30, v15 -; GFX9-NEXT: v_or_b32_e32 v12, 1, v12 -; GFX9-NEXT: v_ashrrev_i32_e32 v2, 30, v2 -; GFX9-NEXT: v_ashrrev_i32_e32 v10, 30, v10 -; GFX9-NEXT: v_or_b32_e32 v15, 1, v15 -; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_or_b32_e32 v10, 1, v10 -; GFX9-NEXT: v_cndmask_b32_e32 v12, 0, v12, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v16| -; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, v15, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v11, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v17, 24, v4 -; GFX9-NEXT: v_add_u32_e32 v2, v13, v2 -; GFX9-NEXT: v_add_u32_e32 v12, v18, v12 -; GFX9-NEXT: v_add_u32_e32 v13, v14, v15 -; GFX9-NEXT: v_add_u32_e32 v3, v3, v10 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v4 -; GFX9-NEXT: v_mul_lo_u32 v4, v12, v11 -; GFX9-NEXT: v_mul_lo_u32 v10, v13, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v3, v17 -; GFX9-NEXT: v_sub_u32_e32 v0, v0, v2 -; GFX9-NEXT: v_sub_u32_sdwa v2, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_e32 v4, v17, v10 -; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: v_mad_f32 v19, -v12, v3, v15 +; GFX9-NEXT: v_cvt_i32_f32_e32 v12, v12 +; GFX9-NEXT: v_mad_f32 v17, -v2, v15, v17 +; GFX9-NEXT: v_cvt_i32_f32_e32 v2, v2 +; GFX9-NEXT: v_ashrrev_i32_e32 v1, 30, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v16, 30, v16 +; GFX9-NEXT: v_or_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, |v3| +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_or_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_cndmask_b32_e32 
v3, 0, v14, vcc +; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v17|, |v15| +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v14, 0, v16, vcc +; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v13, v1 +; GFX9-NEXT: v_add_u16_e32 v11, v18, v11 +; GFX9-NEXT: v_add_u16_e32 v3, v12, v3 +; GFX9-NEXT: v_add_u16_e32 v2, v2, v14 +; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v4 +; GFX9-NEXT: v_mul_lo_u16_e32 v10, v11, v10 +; GFX9-NEXT: v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mul_lo_u16_sdwa v2, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_sub_u16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_sdwa v10, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_sdwa v2, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -2226,45 +2230,49 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ; GFX10-NEXT: global_load_dword v2, v[2:3], off ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v14, v0 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v15, v0 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v15, v0 ; GFX10-NEXT: v_perm_b32 v0, v0, v2, 0x40207 -; GFX10-NEXT: v_mul_f32_e32 v10, v14, v10 ; GFX10-NEXT: v_mul_f32_e32 v11, v4, v11 ; GFX10-NEXT: v_mul_f32_e32 v13, v1, v13 +; GFX10-NEXT: v_mul_f32_e32 v10, v14, v10 ; GFX10-NEXT: v_mul_f32_e32 v12, v15, v12 -; GFX10-NEXT: v_trunc_f32_e32 v10, v10 ; GFX10-NEXT: v_trunc_f32_e32 v11, v11 ; GFX10-NEXT: v_trunc_f32_e32 v13, v13 +; GFX10-NEXT: v_trunc_f32_e32 v10, v10 ; GFX10-NEXT: v_trunc_f32_e32 v12, v12 -; GFX10-NEXT: v_mad_f32 v14, -v10, v1, v14 -; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX10-NEXT: v_mad_f32 v16, -v11, v3, v4 ; GFX10-NEXT: v_mad_f32 v17, -v13, v9, v1 +; GFX10-NEXT: v_mad_f32 v14, -v10, v1, v14 +; GFX10-NEXT: v_mad_f32 v15, -v12, v4, v15 ; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v14|, v1 +; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v16|, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GFX10-NEXT: v_mad_f32 v15, -v12, v4, v15 +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 ; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GFX10-NEXT: v_add_co_ci_u32_e32 
v1, vcc_lo, 0, v10, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v16|, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v17|, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v17|, v9 +; GFX10-NEXT: v_add_nc_u16 v3, v11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v14|, v1 ; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v4 -; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16 v9, v13, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v15|, v4 ; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo +; GFX10-NEXT: v_add_nc_u16 v1, v10, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX10-NEXT: v_add_nc_u16 v4, v12, v4 ; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v1, off @@ -2285,43 +2293,45 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt, ; GFX9-NEXT: s_mov_b32 s4, 0x40207 ; GFX9-NEXT: s_waitcnt vmcnt(1) ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2 ; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v11, v2 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9 -; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10 -; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11 ; GFX9-NEXT: v_perm_b32 v0, v9, v4, s4 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4 ; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v4, v4 -; GFX9-NEXT: v_trunc_f32_e32 v11, v11 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v12, v3 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v10 ; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v4 -; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12 -; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1 -; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v1, v9 +; GFX9-NEXT: v_mul_f32_e32 v11, v1, v11 ; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v9, v9 -; GFX9-NEXT: v_trunc_f32_e32 v12, v12 +; GFX9-NEXT: v_mul_f32_e32 v12, v10, v12 +; GFX9-NEXT: v_trunc_f32_e32 v11, v11 ; GFX9-NEXT: v_mul_f32_e32 v13, v9, v13 +; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14 +; GFX9-NEXT: v_trunc_f32_e32 v12, v12 +; GFX9-NEXT: v_mad_f32 v1, -v11, v2, v1 +; GFX9-NEXT: v_trunc_f32_e32 v13, v13 +; GFX9-NEXT: v_trunc_f32_e32 v14, v14 ; GFX9-NEXT: v_mad_f32 v15, -v12, v3, v10 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v1|, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v11, v11 ; GFX9-NEXT: v_cvt_u32_f32_e32 v12, v12 -; GFX9-NEXT: v_trunc_f32_e32 v13, v13 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v1|, v2 -; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14 ; GFX9-NEXT: v_mad_f32 v9, -v13, v10, v9 ; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v11, vcc -; GFX9-NEXT: v_trunc_f32_e32 v14, v14 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v15|, v3 ; GFX9-NEXT: v_mad_f32 v16, -v14, v4, v2 ; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v12, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 
vcc, |v9|, v10 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v13, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v16|, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v14, vcc -; GFX9-NEXT: v_lshlrev_b16_e32 v2, 8, v2 -; GFX9-NEXT: v_lshlrev_b16_e32 v4, 8, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v15|, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v9|, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v16|, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[4:5] +; GFX9-NEXT: v_add_u16_e32 v1, v11, v1 +; GFX9-NEXT: v_add_u16_sdwa v2, v12, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_add_u16_e32 v3, v13, v3 +; GFX9-NEXT: v_add_u16_sdwa v4, v14, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD @@ -2414,53 +2424,60 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX10-NEXT: global_load_dword v2, v[2:3], off ; GFX10-NEXT: global_load_dword v0, v[0:1], off ; GFX10-NEXT: s_waitcnt vmcnt(1) -; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 ; GFX10-NEXT: v_cvt_f32_ubyte1_e32 v3, v2 -; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 ; GFX10-NEXT: v_cvt_f32_ubyte3_e32 v9, v2 +; GFX10-NEXT: v_cvt_f32_ubyte0_e32 v1, v2 +; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v4, v2 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cvt_f32_ubyte2_e32 v15, v0 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v11, v3 -; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 ; GFX10-NEXT: v_rcp_iflag_f32_e32 v13, v9 -; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v2 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v10, v1 +; GFX10-NEXT: v_rcp_iflag_f32_e32 v12, v4 ; GFX10-NEXT: v_lshrrev_b32_e32 v16, 8, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v17, 24, v2 -; GFX10-NEXT: v_mul_f32_e32 v10, v3, v10 +; GFX10-NEXT: v_lshrrev_b32_e32 v14, 16, v0 +; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505 ; GFX10-NEXT: v_mul_f32_e32 v11, v3, v11 -; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12 ; GFX10-NEXT: v_mul_f32_e32 v13, v15, v13 -; GFX10-NEXT: v_trunc_f32_e32 v10, v10 +; GFX10-NEXT: v_mul_f32_e32 v10, v3, v10 +; GFX10-NEXT: v_mul_f32_e32 v12, v3, v12 ; GFX10-NEXT: v_trunc_f32_e32 v11, v11 -; GFX10-NEXT: v_trunc_f32_e32 v12, v12 ; GFX10-NEXT: v_trunc_f32_e32 v13, v13 -; GFX10-NEXT: v_mad_f32 v18, -v10, v1, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GFX10-NEXT: v_trunc_f32_e32 v10, v10 +; GFX10-NEXT: v_trunc_f32_e32 v12, v12 ; GFX10-NEXT: v_mad_f32 v19, -v11, v3, v3 -; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 -; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3 -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v18|, v1 -; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 ; GFX10-NEXT: v_mad_f32 v15, -v13, v9, v15 +; GFX10-NEXT: v_mad_f32 v18, -v10, v1, v3 +; GFX10-NEXT: v_mad_f32 v20, -v12, v4, v3 +; GFX10-NEXT: v_cvt_u32_f32_e32 v11, v11 +; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v19|, v3 ; GFX10-NEXT: v_cvt_u32_f32_e32 v13, v13 -; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v19|, v3 -; GFX10-NEXT: v_mul_lo_u32 v1, v1, v2 -; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 
vcc_lo, |v20|, v4 -; GFX10-NEXT: v_mul_lo_u32 v3, v3, v16 -; GFX10-NEXT: v_sub_nc_u32_e32 v1, v16, v1 -; GFX10-NEXT: v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo -; GFX10-NEXT: v_cmp_ge_f32_e64 vcc_lo, |v15|, v9 -; GFX10-NEXT: v_mul_lo_u32 v4, v4, v14 -; GFX10-NEXT: v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo +; GFX10-NEXT: v_cvt_u32_f32_e32 v10, v10 +; GFX10-NEXT: v_cvt_u32_f32_e32 v12, v12 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v15|, v9 +; GFX10-NEXT: v_add_nc_u16 v3, v11, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v9, 0, 1, s4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v18|, v1 +; GFX10-NEXT: v_lshrrev_b32_e32 v11, 16, v2 +; GFX10-NEXT: v_mul_lo_u16 v3, v3, v16 +; GFX10-NEXT: v_add_nc_u16 v9, v13, v9 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX10-NEXT: v_cmp_ge_f32_e64 s4, |v20|, v4 +; GFX10-NEXT: v_sub_nc_u16 v3, v16, v3 +; GFX10-NEXT: v_mul_lo_u16 v9, v9, v17 +; GFX10-NEXT: v_add_nc_u16 v1, v10, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: v_lshlrev_b16 v3, 8, v3 +; GFX10-NEXT: v_sub_nc_u16 v9, v14, v9 +; GFX10-NEXT: v_mul_lo_u16 v1, v1, v2 +; GFX10-NEXT: v_add_nc_u16 v4, v12, v4 +; GFX10-NEXT: v_lshlrev_b16 v9, 8, v9 +; GFX10-NEXT: v_sub_nc_u16 v1, v16, v1 +; GFX10-NEXT: v_mul_lo_u16 v4, v4, v11 ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX10-NEXT: v_mul_lo_u32 v9, v9, v17 -; GFX10-NEXT: v_sub_nc_u32_e32 v4, v16, v4 -; GFX10-NEXT: v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX10-NEXT: v_perm_b32 v0, v2, v0, 0x2050505 +; GFX10-NEXT: v_sub_nc_u16 v4, v16, v4 ; GFX10-NEXT: v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX10-NEXT: global_store_dword v[5:6], v1, off @@ -2480,57 +2497,59 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1, ; GFX9-NEXT: global_load_dword v9, v[0:1], off ; GFX9-NEXT: s_mov_b32 s4, 0x2050505 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v2, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v2 -; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v3, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v3 -; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v11, v4 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v17, v11 -; GFX9-NEXT: v_mul_f32_e32 v15, v3, v15 -; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v14, v4 +; GFX9-NEXT: v_cvt_f32_ubyte0_e32 v1, v4 +; GFX9-NEXT: v_cvt_f32_ubyte1_e32 v2, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v13, v1 +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v10, v4 +; GFX9-NEXT: v_cvt_f32_ubyte3_e32 v12, v4 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v14, v2 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v15, v10 +; GFX9-NEXT: v_rcp_iflag_f32_e32 v16, v12 +; GFX9-NEXT: v_mul_f32_e32 v13, v2, v13 +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v11, v9 +; GFX9-NEXT: v_mul_f32_e32 v14, v2, v14 +; GFX9-NEXT: v_trunc_f32_e32 v13, v13 +; GFX9-NEXT: v_mul_f32_e32 v15, v2, v15 +; GFX9-NEXT: v_mul_f32_e32 v16, v11, v16 +; GFX9-NEXT: v_trunc_f32_e32 v14, v14 +; GFX9-NEXT: v_mad_f32 v17, -v13, v1, v2 +; GFX9-NEXT: v_perm_b32 v0, v4, v9, s4 ; GFX9-NEXT: v_trunc_f32_e32 v15, v15 -; GFX9-NEXT: v_rcp_iflag_f32_e32 v18, v14 -; GFX9-NEXT: v_mul_f32_e32 v16, v3, v16 -; GFX9-NEXT: v_mad_f32 v19, -v15, v2, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 ; 
GFX9-NEXT: v_trunc_f32_e32 v16, v16 -; GFX9-NEXT: v_mul_f32_e32 v17, v3, v17 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v2 -; GFX9-NEXT: v_mad_f32 v2, -v16, v3, v3 +; GFX9-NEXT: v_mad_f32 v18, -v14, v2, v2 +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v17|, v1 +; GFX9-NEXT: v_cvt_u32_f32_e32 v13, v13 +; GFX9-NEXT: v_cvt_u32_f32_e32 v14, v14 +; GFX9-NEXT: v_mad_f32 v19, -v15, v10, v2 +; GFX9-NEXT: v_cvt_u32_f32_e32 v15, v15 +; GFX9-NEXT: v_mad_f32 v11, -v16, v12, v11 ; GFX9-NEXT: v_cvt_u32_f32_e32 v16, v16 -; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cvt_f32_ubyte2_e32 v13, v9 -; GFX9-NEXT: v_trunc_f32_e32 v17, v17 -; GFX9-NEXT: v_mul_f32_e32 v18, v13, v18 -; GFX9-NEXT: v_mad_f32 v19, -v17, v11, v3 -; GFX9-NEXT: v_cvt_u32_f32_e32 v17, v17 -; GFX9-NEXT: v_addc_co_u32_e32 v15, vcc, 0, v15, vcc -; GFX9-NEXT: v_trunc_f32_e32 v18, v18 -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v2|, v3 -; GFX9-NEXT: v_mad_f32 v13, -v18, v14, v13 -; GFX9-NEXT: v_cvt_u32_f32_e32 v18, v18 -; GFX9-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v16, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v19|, v11 -; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v17, vcc -; GFX9-NEXT: v_cmp_ge_f32_e64 vcc, |v13|, v14 -; GFX9-NEXT: v_lshrrev_b32_e32 v0, 16, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v10, 8, v4 -; GFX9-NEXT: v_lshrrev_b32_e32 v12, 24, v4 -; GFX9-NEXT: v_addc_co_u32_e32 v11, vcc, 0, v18, vcc -; GFX9-NEXT: v_perm_b32 v1, v4, v9, s4 -; GFX9-NEXT: v_mul_lo_u32 v4, v15, v4 -; GFX9-NEXT: v_mul_lo_u32 v2, v2, v10 -; GFX9-NEXT: v_mul_lo_u32 v0, v3, v0 -; GFX9-NEXT: v_mul_lo_u32 v3, v11, v12 -; GFX9-NEXT: v_sub_u32_e32 v4, v10, v4 -; GFX9-NEXT: v_sub_u32_sdwa v2, v10, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX9-NEXT: v_sub_u32_e32 v0, v10, v0 -; GFX9-NEXT: v_sub_u32_sdwa v3, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v2, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX9-NEXT: v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD -; GFX9-NEXT: global_store_dword v[5:6], v0, off -; GFX9-NEXT: global_store_dword v[7:8], v1, off +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v18|, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v19|, v10 +; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5] +; GFX9-NEXT: v_cmp_ge_f32_e64 s[4:5], |v11|, v12 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5] +; GFX9-NEXT: v_lshrrev_b32_e32 v3, 8, v4 +; GFX9-NEXT: v_add_u16_e32 v1, v13, v1 +; GFX9-NEXT: v_add_u16_e32 v2, v14, v2 +; GFX9-NEXT: v_add_u16_e32 v10, v15, v10 +; GFX9-NEXT: v_add_u16_e32 v11, v16, v11 +; GFX9-NEXT: v_mul_lo_u16_e32 v1, v1, v4 +; GFX9-NEXT: v_mul_lo_u16_e32 v2, v2, v3 +; GFX9-NEXT: v_mul_lo_u16_sdwa v10, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_mul_lo_u16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3 +; GFX9-NEXT: v_sub_u16_e32 v1, v3, v1 +; GFX9-NEXT: v_sub_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD +; GFX9-NEXT: v_sub_u16_e32 v3, v3, v10 +; GFX9-NEXT: v_sub_u16_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v2, 
v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD +; GFX9-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD +; GFX9-NEXT: global_store_dword v[5:6], v1, off +; GFX9-NEXT: global_store_dword v[7:8], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll index 24a4d8fbde200..5bbf9abdc36b6 100644 --- a/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll +++ b/llvm/test/CodeGen/AMDGPU/private-memory-atomics.ll @@ -621,10 +621,14 @@ define i32 @atomicrmw_dec_private_i32(ptr addrspace(5) %ptr) { ; GCN-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_add_i32_e32 v2, vcc, -1, v1 +; GCN-NEXT: v_cmp_lt_u32_e32 vcc, 4, v1 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 -; GCN-NEXT: v_cmp_lt_u32_e64 s[4:5], 4, v1 -; GCN-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 4, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GCN-NEXT: v_cndmask_b32_e64 v2, v2, 4, vcc ; GCN-NEXT: buffer_store_dword v2, v0, s[0:3], 0 offen ; GCN-NEXT: v_mov_b32_e32 v0, v1 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/rem_i128.ll b/llvm/test/CodeGen/AMDGPU/rem_i128.ll index 6512bee36e88b..6820d8aa1a047 100644 --- a/llvm/test/CodeGen/AMDGPU/rem_i128.ll +++ b/llvm/test/CodeGen/AMDGPU/rem_i128.ll @@ -12,30 +12,31 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v0 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v1, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v2, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v3, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v2, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v3, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] ; GFX9-NEXT: v_ashrrev_i32_e32 v20, 31, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v11, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, 0, v4 ; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v5, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v6, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v7, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v6, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v12, vcc, 0, v7, vcc ; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_mov_b32_e32 v21, v20 +; GFX9-NEXT: v_ffbh_u32_e32 v13, v1 ; GFX9-NEXT: v_cndmask_b32_e32 v22, v5, v9, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v23, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v11, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v10, vcc +; GFX9-NEXT: v_or_b32_e32 v9, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v8, v0, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v5, v7, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v6, v11, vcc +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] ; GFX9-NEXT: v_or_b32_e32 v7, v22, v5 ; GFX9-NEXT: v_or_b32_e32 v6, v23, v4 +; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] -; GFX9-NEXT: v_or_b32_e32 v7, v1, v3 -; 
GFX9-NEXT: v_or_b32_e32 v6, v0, v2 -; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] ; GFX9-NEXT: v_ffbh_u32_e32 v6, v4 ; GFX9-NEXT: v_add_u32_e32 v6, 32, v6 ; GFX9-NEXT: v_ffbh_u32_e32 v7, v5 @@ -44,51 +45,54 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 ; GFX9-NEXT: v_ffbh_u32_e32 v8, v22 ; GFX9-NEXT: v_min_u32_e32 v7, v7, v8 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 64, v7 -; GFX9-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v8, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5] -; GFX9-NEXT: v_ffbh_u32_e32 v10, v3 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v7, v6, vcc ; GFX9-NEXT: v_ffbh_u32_e32 v7, v2 ; GFX9-NEXT: v_add_u32_e32 v7, 32, v7 -; GFX9-NEXT: v_min_u32_e32 v7, v7, v10 -; GFX9-NEXT: v_ffbh_u32_e32 v10, v0 -; GFX9-NEXT: v_add_u32_e32 v10, 32, v10 -; GFX9-NEXT: v_ffbh_u32_e32 v11, v1 -; GFX9-NEXT: v_min_u32_e32 v10, v10, v11 +; GFX9-NEXT: v_min_u32_e32 v7, v7, v9 +; GFX9-NEXT: v_ffbh_u32_e32 v9, v0 +; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 +; GFX9-NEXT: v_min_u32_e32 v9, v9, v13 ; GFX9-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc -; GFX9-NEXT: v_add_co_u32_e32 v10, vcc, 64, v10 -; GFX9-NEXT: v_addc_co_u32_e64 v11, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 +; GFX9-NEXT: v_addc_co_u32_e64 v13, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v7, v10, v7, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, 0, vcc +; GFX9-NEXT: v_mov_b32_e32 v10, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v7, v9, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, v13, 0, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v6, vcc, v6, v7 -; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v11, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v9, vcc -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v9, vcc -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[6:7] -; GFX9-NEXT: v_or_b32_e32 v12, v7, v9 +; GFX9-NEXT: v_subb_co_u32_e32 v7, vcc, v8, v13, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v8, vcc, 0, v10, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, 0, v10, vcc +; GFX9-NEXT: s_mov_b64 s[4:5], 0x7f +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7] +; GFX9-NEXT: v_or_b32_e32 v15, v7, v9 ; GFX9-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX9-NEXT: v_mov_b32_e32 v21, v20 +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v10, v13, v10, vcc +; GFX9-NEXT: v_or3_b32 v10, v12, v11, v10 +; GFX9-NEXT: v_xor_b32_e32 v16, 1, v10 ; GFX9-NEXT: v_and_b32_e32 v10, 1, v10 ; GFX9-NEXT: v_xor_b32_e32 v11, 0x7f, v6 +; GFX9-NEXT: v_or_b32_e32 v14, v11, v8 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 -; GFX9-NEXT: v_or_b32_e32 v11, v11, v8 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[11:12] -; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v2, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v10, v3, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 
v12, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, v0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15] +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GFX9-NEXT: v_and_b32_e32 v14, v16, v14 +; GFX9-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB0_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v24, vcc, 1, v6 @@ -240,23 +244,20 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0: ; %bb.0: ; %_udiv-special-cases ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v1 -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec @@ -283,291 +284,295 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v12 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: v_writelane_b32 v30, s10, 2 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_writelane_b32 v30, s11, 3 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v9, vcc, s10, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 1 +; GFX9-O0-NEXT: s_mov_b32 s8, s4 +; GFX9-O0-NEXT: v_writelane_b32 v30, s8, 2 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_writelane_b32 v30, s9, 3 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v7, vcc, s8, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v5, vcc, v4, v3, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v0, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v6, v1, vcc -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: 
v_mov_b32_e32 v10, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[11:12], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[6:7], v[11:12], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v3 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[6:7] ; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v18 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, s10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v10, v9, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v10, v7, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v12, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v12, v8, vcc -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v18 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v13, vcc, s8, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v11, vcc, v9, v10, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v12, vcc, v9, v4, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v9, s9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v9, v7, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v11 ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 -; GFX9-O0-NEXT: s_mov_b64 s[4:5], s[6:7] -; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[4:5], v[17:18], s[4:5] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v13 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, 
v11, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: v_mov_b32_e32 v17, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v18, v9 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v12, s[4:5] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 killed $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, v7, v10, s[4:5] -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v10 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v18 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v16 -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v18 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v17 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[12:13], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 
-; GFX9-O0-NEXT: v_or_b32_e64 v14, v12, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v15 -; GFX9-O0-NEXT: v_or_b32_e64 v12, v12, v13 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_lt_i64_e64 s[6:7], v[17:18], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, v10, v11, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], s[6:7] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[10:11], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: s_mov_b32 s13, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v7, v7, s13 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v7, v9, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, v4, v7, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v12 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v12 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v18 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v17 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v19, v5 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v20, v16 +; GFX9-O0-NEXT: 
v_mov_b32_e32 v19, v15 +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v18 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v17 +; GFX9-O0-NEXT: v_or_b32_e64 v17, v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v18, v4 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[17:18], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v4, v16 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v7 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v15 +; GFX9-O0-NEXT: v_or_b32_e64 v14, v7, v14 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], v[14:15], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[10:11] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[6:7] +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[12:13], s[6:7] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 +; GFX9-O0-NEXT: s_mov_b32 s11, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v9, v9, s11 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v10 +; GFX9-O0-NEXT: v_min_u32_e64 v9, v9, v10 +; GFX9-O0-NEXT: s_mov_b32 s10, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v10 ; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 -; GFX9-O0-NEXT: v_min_u32_e64 v7, v7, v8 -; GFX9-O0-NEXT: s_mov_b32 s12, 0 -; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v10 +; GFX9-O0-NEXT: v_add_u32_e64 v8, v8, s11 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v11, v11 +; GFX9-O0-NEXT: v_min_u32_e64 v14, v8, v11 +; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v8 +; GFX9-O0-NEXT: s_mov_b64 s[12:13], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v14 +; GFX9-O0-NEXT: s_mov_b32 s14, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v15 +; GFX9-O0-NEXT: s_mov_b32 s16, s13 +; GFX9-O0-NEXT: v_add_co_u32_e64 v11, s[14:15], v11, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, s16 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v8, s[14:15], v8, v12, s[14:15] +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v8 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v8, v13, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v11 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v9, v10, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v8 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 -; GFX9-O0-NEXT: v_min_u32_e64 v12, v6, v9 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 
s[6:7], v[5:6], s[6:7] +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s11 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v5, v6 ; GFX9-O0-NEXT: ; implicit-def: $sgpr14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s11 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v12, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v13, v5, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr11 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 +; GFX9-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v13 +; GFX9-O0-NEXT: s_mov_b32 s10, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v14 +; GFX9-O0-NEXT: s_mov_b32 s12, s13 +; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[10:11], v12, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[10:11], v5, v13, s[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v6, v8, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v6 -; GFX9-O0-NEXT: s_mov_b64 s[14:15], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 -; GFX9-O0-NEXT: s_mov_b32 s16, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v13 -; GFX9-O0-NEXT: s_mov_b32 s18, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[16:17], v9, s16 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, s18 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v6, s[16:17], v6, v10, s[16:17] -; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v11, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v7 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v9 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v7, v8, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def 
$vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr16 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[8:9], s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[5:6], s[10:11] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[8:9], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7] +; GFX9-O0-NEXT: v_or3_b32 v7, v4, v7, v10 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v7, s6 +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, 1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s12, s11 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s12 +; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 killed $sgpr10_sgpr11 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s10 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s13 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v11, v4, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr13 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v11 -; GFX9-O0-NEXT: s_mov_b32 s12, s14 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v12 -; GFX9-O0-NEXT: s_mov_b32 s14, s15 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[12:13], v10, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v11, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 -; 
GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 -; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v3, v5, s[6:7] +; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; GFX9-O0-NEXT: v_and_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, 1 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -577,51 +582,51 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 4 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_3 ; GFX9-O0-NEXT: s_branch .LBB0_8 ; GFX9-O0-NEXT: .LBB0_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: ; %bb.2: ; %Flow -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) 
-; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(7) -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_5 ; GFX9-O0-NEXT: .LBB0_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 @@ -631,22 +636,22 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_9 ; GFX9-O0-NEXT: .LBB0_4: ; %udiv-loop-exit -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], 
s32 offset:220 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 1 ; GFX9-O0-NEXT: s_waitcnt vmcnt(2) ; GFX9-O0-NEXT: v_lshlrev_b64 v[2:3], s4, v[0:1] @@ -679,67 +684,67 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_3 ; GFX9-O0-NEXT: .LBB0_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 9 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(1) -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 
; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_4 ; GFX9-O0-NEXT: .LBB0_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 10 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 11 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v22, off, s[0:3], s32 
offset:244 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 63 ; GFX9-O0-NEXT: s_waitcnt vmcnt(16) ; GFX9-O0-NEXT: v_lshrrev_b64 v[28:29], s4, v[2:3] @@ -880,72 +885,72 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v3 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v2 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v0 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v15 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v14 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill ; 
GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v13 ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v12 -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], 
s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execnz .LBB0_6 ; GFX9-O0-NEXT: s_branch .LBB0_1 ; GFX9-O0-NEXT: .LBB0_7: ; %udiv-preheader -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -1029,46 +1034,46 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v12, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v15, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v14, s4 -; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword 
v16, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 10 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 11 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] -; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] +; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, 
off, s[0:3], s32 offset:236 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB0_6 ; GFX9-O0-NEXT: .LBB0_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -1106,14 +1111,14 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b32 s4, 0x7f ; GFX9-O0-NEXT: v_sub_u32_e64 v2, s4, v3 ; GFX9-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[10:11] @@ -1159,12 +1164,12 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v2 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v9 ; GFX9-O0-NEXT: v_or_b32_e64 v2, v2, v3 @@ -1178,26 +1183,26 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-O0-NEXT: v_mov_b32_e32 v3, s7 -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:136 ; 
4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 s[6:7], exec ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[22:23], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[22:23] +; GFX9-O0-NEXT: s_mov_b64 exec, s[20:21] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB0_5 ; GFX9-O0-NEXT: s_branch .LBB0_7 @@ -1212,10 +1217,10 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b32 s4, 32 @@ -1489,7 +1494,7 @@ define i128 @v_srem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4] ; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) 
; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -1501,12 +1506,13 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-LABEL: v_urem_i128_vv: ; GFX9: ; %bb.0: ; %_udiv-special-cases ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_e32 v11, v1, v3 +; GFX9-NEXT: v_or_b32_e32 v10, v0, v2 +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] ; GFX9-NEXT: v_or_b32_e32 v9, v5, v7 ; GFX9-NEXT: v_or_b32_e32 v8, v4, v6 +; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9] -; GFX9-NEXT: v_or_b32_e32 v9, v1, v3 -; GFX9-NEXT: v_or_b32_e32 v8, v0, v2 -; GFX9-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9] ; GFX9-NEXT: v_ffbh_u32_e32 v8, v6 ; GFX9-NEXT: v_add_u32_e32 v8, 32, v8 ; GFX9-NEXT: v_ffbh_u32_e32 v9, v7 @@ -1515,9 +1521,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_add_u32_e32 v9, 32, v9 ; GFX9-NEXT: v_ffbh_u32_e32 v10, v5 ; GFX9-NEXT: v_min_u32_e32 v9, v9, v10 -; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; GFX9-NEXT: v_add_co_u32_e32 v9, vcc, 64, v9 -; GFX9-NEXT: v_addc_co_u32_e64 v10, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7] ; GFX9-NEXT: v_ffbh_u32_e32 v11, v3 ; GFX9-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc @@ -1526,40 +1532,43 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-NEXT: v_min_u32_e32 v9, v9, v11 ; GFX9-NEXT: v_ffbh_u32_e32 v11, v0 ; GFX9-NEXT: v_add_u32_e32 v11, 32, v11 -; GFX9-NEXT: v_ffbh_u32_e32 v12, v1 -; GFX9-NEXT: v_min_u32_e32 v11, v11, v12 +; GFX9-NEXT: v_ffbh_u32_e32 v14, v1 +; GFX9-NEXT: v_min_u32_e32 v11, v11, v14 ; GFX9-NEXT: v_cndmask_b32_e64 v10, v10, 0, vcc ; GFX9-NEXT: v_add_co_u32_e32 v11, vcc, 64, v11 -; GFX9-NEXT: v_addc_co_u32_e64 v12, s[6:7], 0, 0, vcc +; GFX9-NEXT: v_addc_co_u32_e64 v14, s[4:5], 0, 0, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: s_mov_b64 s[6:7], 0x7f +; GFX9-NEXT: s_mov_b64 s[4:5], 0x7f ; GFX9-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v12, v12, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v14, v14, 0, vcc ; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v8, v9 -; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v10, v12, vcc +; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v10, v14, vcc ; GFX9-NEXT: v_mov_b32_e32 v11, 0 ; GFX9-NEXT: v_subb_co_u32_e32 v10, vcc, 0, v11, vcc ; GFX9-NEXT: v_subb_co_u32_e32 v11, vcc, 0, v11, vcc -; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX9-NEXT: v_or_b32_e32 v17, v9, v11 +; GFX9-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc ; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] -; GFX9-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v14, v15, v14, vcc +; GFX9-NEXT: v_or3_b32 v12, v13, v12, v14 +; GFX9-NEXT: v_xor_b32_e32 v18, 1, v12 ; GFX9-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX9-NEXT: v_xor_b32_e32 v13, 0x7f, v8 +; GFX9-NEXT: v_or_b32_e32 v16, v13, v10 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 -; GFX9-NEXT: v_xor_b32_e32 v12, 0x7f, v8 -; GFX9-NEXT: v_or_b32_e32 v13, v9, v11 -; GFX9-NEXT: v_or_b32_e32 v12, v12, v10 -; GFX9-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[12:13] -; GFX9-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GFX9-NEXT: v_cndmask_b32_e64 v15, v3, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 
v14, v2, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v1, 0, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, s[4:5] -; GFX9-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GFX9-NEXT: s_and_saveexec_b64 s[8:9], s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v15, v3, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v14, v2, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v13, v1, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v12, v0, 0, vcc +; GFX9-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17] +; GFX9-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX9-NEXT: v_and_b32_e32 v16, v18, v16 +; GFX9-NEXT: v_and_b32_e32 v16, 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v16 +; GFX9-NEXT: s_and_saveexec_b64 s[8:9], vcc ; GFX9-NEXT: s_cbranch_execz .LBB1_6 ; GFX9-NEXT: ; %bb.1: ; %udiv-bb1 ; GFX9-NEXT: v_add_co_u32_e32 v22, vcc, 1, v8 @@ -1705,43 +1714,43 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v16, v6 ; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v12, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-O0-NEXT: v_mov_b32_e32 v14, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v0 ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v13, v3 +; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v15, v3 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4 -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v7 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v7 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GFX9-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; 
GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v17 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill @@ -1751,8 +1760,8 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v17 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill @@ -1761,189 +1770,193 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v15 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O0-NEXT: v_or_b32_e64 v2, v7, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v0 -; GFX9-O0-NEXT: v_or_b32_e64 v0, v4, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v17 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v1 +; GFX9-O0-NEXT: v_or_b32_e64 v2, v8, v9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v16 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v0 +; GFX9-O0-NEXT: v_or_b32_e64 v0, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v2 -; GFX9-O0-NEXT: s_mov_b64 s[6:7], 0 +; GFX9-O0-NEXT: s_mov_b64 s[4:5], 0 ; GFX9-O0-NEXT: ; implicit-def: $vgpr30 : SGPR spill to VGPR lane -; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 0 -; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 1 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[0:1], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v13 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v9 -; GFX9-O0-NEXT: v_or_b32_e64 v14, v3, v1 -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v12 -; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v8, v2, v0 -; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v14 -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], s[6:7] -; GFX9-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 -; GFX9-O0-NEXT: s_mov_b32 s9, 32 -; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s9 +; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 0 +; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 1 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[0:1], s[4:5] +; GFX9-O0-NEXT: 
v_mov_b32_e32 v1, v15 +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v11 +; GFX9-O0-NEXT: v_or_b32_e64 v4, v3, v1 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v14 +; GFX9-O0-NEXT: v_mov_b32_e32 v2, v10 +; GFX9-O0-NEXT: v_or_b32_e64 v10, v2, v0 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[10:11], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[8:9] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[6:7] ; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v6 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v5, v6 -; GFX9-O0-NEXT: s_mov_b32 s8, 0 +; GFX9-O0-NEXT: s_mov_b32 s7, 32 +; GFX9-O0-NEXT: v_add_u32_e64 v6, v6, s7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v9, v9 +; GFX9-O0-NEXT: v_min_u32_e64 v11, v6, v9 +; GFX9-O0-NEXT: s_mov_b32 s6, 0 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr11 killed $vgpr11 def $vgpr11_vgpr12 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v6 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v5 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v8, v8 +; GFX9-O0-NEXT: v_min_u32_e64 v9, v5, v8 +; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: s_mov_b64 s[8:9], 64 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v9 +; GFX9-O0-NEXT: s_mov_b32 s10, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: s_mov_b32 s12, s9 +; GFX9-O0-NEXT: v_add_co_u32_e64 v9, s[10:11], v8, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, s12 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[10:11], v5, v8, s[10:11] +; GFX9-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: s_mov_b64 s[10:11], s[4:5] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[10:11], v[16:17], s[10:11] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[10:11] +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v11 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v6, v8, s[10:11] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v11, v5 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v0 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v6, v1 +; GFX9-O0-NEXT: v_min_u32_e64 v8, v5, v6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v2 +; GFX9-O0-NEXT: v_add_u32_e64 v5, v5, s7 +; GFX9-O0-NEXT: v_ffbh_u32_e64 v12, v3 +; GFX9-O0-NEXT: v_min_u32_e64 v16, v5, v12 +; GFX9-O0-NEXT: ; implicit-def: $sgpr7 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-O0-NEXT: ; kill: def $vgpr16 killed $vgpr16 def $vgpr16_vgpr17 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v17, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v12, v16 +; GFX9-O0-NEXT: s_mov_b32 s6, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v17 +; GFX9-O0-NEXT: s_mov_b32 s8, s9 +; GFX9-O0-NEXT: v_add_co_u32_e64 v12, s[6:7], v12, s6 +; GFX9-O0-NEXT: v_mov_b32_e32 v13, s8 +; GFX9-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v5, v13, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def 
$vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v13 +; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[14:15], s[6:7] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[6:7] +; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v12 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v12, v6, v8, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v13, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v12 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 +; GFX9-O0-NEXT: v_mov_b32_e32 v8, v13 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v11 +; GFX9-O0-NEXT: s_mov_b32 s8, s4 +; GFX9-O0-NEXT: s_mov_b32 s9, s5 +; GFX9-O0-NEXT: v_sub_co_u32_e32 v5, vcc, v5, v9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v9, vcc, v6, v8, vcc ; GFX9-O0-NEXT: v_mov_b32_e32 v8, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s8 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v6, v8, vcc +; GFX9-O0-NEXT: v_mov_b32_e32 v10, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, s9 +; GFX9-O0-NEXT: v_subb_co_u32_e32 v10, vcc, v6, v10, vcc +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v4 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v7, v7 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v7 -; GFX9-O0-NEXT: ; implicit-def: $sgpr10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 -; GFX9-O0-NEXT: s_mov_b64 s[10:11], 64 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v14 -; GFX9-O0-NEXT: s_mov_b32 s12, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 -; GFX9-O0-NEXT: s_mov_b32 s14, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v7, s[12:13], v7, s12 -; GFX9-O0-NEXT: v_mov_b32_e32 v8, s14 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[12:13], v4, v8, s[12:13] -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: s_mov_b64 s[12:13], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[10:11], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v9, s[12:13] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v8, v5, v6, s[12:13] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v9 +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v4 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v0 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v5, v1 -; GFX9-O0-NEXT: v_min_u32_e64 v5, v4, v5 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v9, v10 +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: s_nop 0 
+; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[8:9], s[4:5] +; GFX9-O0-NEXT: s_mov_b64 s[10:11], 0x7f +; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[5:6], s[10:11] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[12:13] +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[12:13], v[8:9], s[4:5] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[12:13] +; GFX9-O0-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[6:7] +; GFX9-O0-NEXT: v_or3_b32 v7, v4, v7, v10 +; GFX9-O0-NEXT: s_mov_b32 s6, 1 +; GFX9-O0-NEXT: v_xor_b32_e64 v4, v7, s6 +; GFX9-O0-NEXT: v_and_b32_e64 v7, 1, v7 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v7, 1 +; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 +; GFX9-O0-NEXT: s_mov_b32 s12, s11 +; GFX9-O0-NEXT: v_xor_b32_e64 v7, v7, s12 +; GFX9-O0-NEXT: ; kill: def $sgpr10 killed $sgpr10 killed $sgpr10_sgpr11 +; GFX9-O0-NEXT: v_xor_b32_e64 v5, v5, s10 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v6 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v4, v2 -; GFX9-O0-NEXT: v_add_u32_e64 v4, v4, s9 -; GFX9-O0-NEXT: v_ffbh_u32_e64 v10, v3 -; GFX9-O0-NEXT: v_min_u32_e64 v14, v4, v10 -; GFX9-O0-NEXT: ; implicit-def: $sgpr9 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s8 -; GFX9-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v15, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v10, v14 -; GFX9-O0-NEXT: s_mov_b32 s8, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v15 -; GFX9-O0-NEXT: s_mov_b32 s10, s11 -; GFX9-O0-NEXT: v_add_co_u32_e64 v10, s[8:9], v10, s8 -; GFX9-O0-NEXT: v_mov_b32_e32 v11, s10 -; GFX9-O0-NEXT: v_addc_co_u32_e64 v4, s[8:9], v4, v11, s[8:9] -; GFX9-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v11, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v11 -; GFX9-O0-NEXT: s_mov_b64 s[8:9], s[6:7] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[8:9], v[12:13], s[8:9] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v4, v7, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v10, v9 +; GFX9-O0-NEXT: v_or_b32_e64 v7, v7, v10 +; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 killed $vgpr5_vgpr6 killed $exec +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v8 +; GFX9-O0-NEXT: v_or_b32_e64 v5, v5, v6 ; GFX9-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, v8 -; GFX9-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr5_vgpr6 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v9 -; GFX9-O0-NEXT: s_mov_b32 s10, s6 -; GFX9-O0-NEXT: s_mov_b32 s11, s7 -; GFX9-O0-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v7 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v8, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s10 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s10 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v7, vcc, v5, v6, vcc -; GFX9-O0-NEXT: v_mov_b32_e32 v6, s11 -; GFX9-O0-NEXT: v_mov_b32_e32 v5, s11 -; GFX9-O0-NEXT: v_subb_co_u32_e32 v6, vcc, v5, v6, vcc -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v8 -; GFX9-O0-NEXT: buffer_store_dword v4, off, 
s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v8, v6 -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_nop 0 -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[7:8], s[6:7] -; GFX9-O0-NEXT: s_mov_b64 s[12:13], 0x7f -; GFX9-O0-NEXT: v_cmp_gt_u64_e64 s[14:15], v[4:5], s[12:13] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[14:15], v[7:8], s[6:7] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[14:15] -; GFX9-O0-NEXT: v_cndmask_b32_e64 v6, v6, v9, s[8:9] -; GFX9-O0-NEXT: v_and_b32_e64 v6, 1, v6 -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[8:9], v6, 1 -; GFX9-O0-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] -; GFX9-O0-NEXT: s_mov_b64 s[4:5], -1 -; GFX9-O0-NEXT: s_xor_b64 s[4:5], s[8:9], s[4:5] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: s_mov_b32 s14, s13 -; GFX9-O0-NEXT: v_xor_b32_e64 v6, v6, s14 -; GFX9-O0-NEXT: ; kill: def $sgpr12 killed $sgpr12 killed $sgpr12_sgpr13 -; GFX9-O0-NEXT: v_xor_b32_e64 v4, v4, s12 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v5 -; GFX9-O0-NEXT: v_mov_b32_e32 v9, v8 -; GFX9-O0-NEXT: v_or_b32_e64 v6, v6, v9 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 killed $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v7 -; GFX9-O0-NEXT: v_or_b32_e64 v4, v4, v5 -; GFX9-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v5, v6 -; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[6:7], v[4:5], s[6:7] -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v1, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 -; GFX9-O0-NEXT: ; implicit-def: $sgpr12 +; GFX9-O0-NEXT: v_mov_b32_e32 v6, v7 +; GFX9-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], s[4:5] +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, v0, v1, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 +; GFX9-O0-NEXT: ; implicit-def: $sgpr10 ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 -; GFX9-O0-NEXT: v_mov_b32_e32 v4, s11 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v4, v3, v4, s[8:9] -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s10 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[8:9] -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 -; GFX9-O0-NEXT: ; implicit-def: $sgpr8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-O0-NEXT: v_mov_b32_e32 v5, s9 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, v3, v5, s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, s8 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v2, v2, v3, s[6:7] +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 +; GFX9-O0-NEXT: ; implicit-def: $sgpr6 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v4 -; GFX9-O0-NEXT: s_and_b64 s[6:7], s[4:5], s[6:7] +; GFX9-O0-NEXT: v_mov_b32_e32 v3, v5 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5] +; 
GFX9-O0-NEXT: v_and_b32_e64 v4, v4, v5 +; GFX9-O0-NEXT: v_and_b32_e64 v4, 1, v4 +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, 1 ; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill @@ -1953,17 +1966,17 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[4:5], exec ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 2 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 3 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_3 ; GFX9-O0-NEXT: s_branch .LBB1_8 ; GFX9-O0-NEXT: .LBB1_1: ; %Flow -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 4 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 5 @@ -1995,9 +2008,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_5 ; GFX9-O0-NEXT: .LBB1_3: ; %Flow2 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 2 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 3 @@ -2055,9 +2068,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_3 ; GFX9-O0-NEXT: .LBB1_5: ; %Flow1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s4, v30, 6 ; GFX9-O0-NEXT: v_readlane_b32 s5, v30, 7 @@ -2086,9 +2099,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_branch .LBB1_4 ; GFX9-O0-NEXT: .LBB1_6: ; %udiv-do-while ; GFX9-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_readlane_b32 s6, v30, 8 ; GFX9-O0-NEXT: v_readlane_b32 s7, v30, 9 @@ -2280,9 +2293,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_mov_b64 s[6:7], s[4:5] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: 
s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill @@ -2311,9 +2324,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_cbranch_execnz .LBB1_6 ; GFX9-O0-NEXT: s_branch .LBB1_1 ; GFX9-O0-NEXT: .LBB1_7: ; %udiv-preheader -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload @@ -2413,9 +2426,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_writelane_b32 v30, s4, 8 ; GFX9-O0-NEXT: v_writelane_b32 v30, s5, 9 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_nop 0 ; GFX9-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill @@ -2442,9 +2455,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_branch .LBB1_6 ; GFX9-O0-NEXT: .LBB1_8: ; %udiv-bb1 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload @@ -2571,9 +2584,9 @@ define i128 @v_urem_i128_vv(i128 %lhs, i128 %rhs) { ; GFX9-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7] ; GFX9-O0-NEXT: v_writelane_b32 v30, s6, 6 ; GFX9-O0-NEXT: v_writelane_b32 v30, s7, 7 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[18:19], -1 +; GFX9-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; GFX9-O0-NEXT: buffer_store_dword v30, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[18:19] +; GFX9-O0-NEXT: s_mov_b64 exec, s[16:17] ; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_5 ; GFX9-O0-NEXT: s_branch .LBB1_7 diff --git a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll index 04eea20993608..b36651c50ee0a 100644 --- a/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll +++ b/llvm/test/CodeGen/AMDGPU/repeated-divisor.ll @@ -311,16 +311,21 @@ define <2 x double> @v_repeat_divisor_f64_x2_arcp(double %x, double %y, double % ; GFX6-NEXT: v_div_scale_f64 v[6:7], s[4:5], v[4:5], v[4:5], 1.0 ; GFX6-NEXT: v_rcp_f64_e32 v[8:9], v[6:7] ; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GFX6-NEXT: 
v_cndmask_b32_e64 v14, 0, 1, vcc ; GFX6-NEXT: v_fma_f64 v[10:11], -v[6:7], v[8:9], 1.0 ; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[10:11], v[8:9] ; GFX6-NEXT: v_div_scale_f64 v[10:11], s[4:5], 1.0, v[4:5], 1.0 ; GFX6-NEXT: v_fma_f64 v[12:13], -v[6:7], v[8:9], 1.0 ; GFX6-NEXT: s_mov_b32 s4, 0x3ff00000 ; GFX6-NEXT: v_fma_f64 v[8:9], v[8:9], v[12:13], v[8:9] -; GFX6-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v11 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, s4, v11 ; GFX6-NEXT: v_mul_f64 v[12:13], v[10:11], v[8:9] -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc ; GFX6-NEXT: v_fma_f64 v[6:7], -v[6:7], v[12:13], v[10:11] +; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v10, v10, v14 +; GFX6-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX6-NEXT: s_nop 3 ; GFX6-NEXT: v_div_fmas_f64 v[6:7], v[6:7], v[8:9], v[12:13] ; GFX6-NEXT: v_div_fixup_f64 v[4:5], v[6:7], v[4:5], 1.0 ; GFX6-NEXT: v_mul_f64 v[0:1], v[0:1], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll index b78cbb0ac29cf..24b3f1af7917f 100644 --- a/llvm/test/CodeGen/AMDGPU/rsq.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/rsq.f64.ll @@ -40,17 +40,22 @@ define amdgpu_ps <2 x i32> @s_rsq_f64(double inreg %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v7 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 @@ -220,17 +225,22 @@ define amdgpu_ps <2 x i32> @s_rsq_f64_fabs(double inreg %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: 
v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 @@ -401,17 +411,22 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_f64(double inreg %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v7 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 ; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 @@ -581,17 +596,22 @@ define amdgpu_ps <2 x i32> @s_neg_rsq_neg_f64(double inreg %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 ; SI-SDAG-NEXT: v_readfirstlane_b32 s0, 
v0 @@ -763,17 +783,22 @@ define double @v_rsq_f64(double %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -932,17 +957,22 @@ define double @v_rsq_f64_fabs(double %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1102,17 +1132,22 @@ define double @v_rsq_f64_missing_contract0(double %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; 
SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1271,17 +1306,22 @@ define double @v_rsq_f64_missing_contract1(double %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1440,17 +1480,22 @@ define double @v_neg_rsq_f64(double %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: 
v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -1625,35 +1670,44 @@ define <2 x double> @v_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] -; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[10:11], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] -; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19 -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[16:17], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v11 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[16:17], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[16:17], v[6:7], v[16:17] +; SI-SDAG-NEXT: v_xor_b32_e32 v18, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_and_b32_e32 v18, 1, v18 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] +; SI-SDAG-NEXT: s_nop 2 +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[4:5], v[12:13] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v17 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-SDAG-NEXT: v_xor_b32_e32 v8, v9, v8 +; SI-SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 1.0 -; SI-SDAG-NEXT: s_nop 0 -; 
SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9] +; SI-SDAG-NEXT: s_nop 2 +; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -1925,35 +1979,44 @@ define <2 x double> @v_neg_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], -1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], -1.0, v[0:1], -1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] -; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], -1.0 +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[10:11], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] -; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], -1.0, v[2:3], -1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v19 -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_rcp_f64_e32 v[16:17], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v11 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[16:17], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[16:17], v[6:7], v[16:17] +; SI-SDAG-NEXT: v_xor_b32_e32 v18, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], -1.0, v[2:3], -1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_and_b32_e32 v18, 1, v18 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] +; SI-SDAG-NEXT: s_nop 2 +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[4:5], v[12:13] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v17 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-SDAG-NEXT: v_xor_b32_e32 v8, v9, v8 +; SI-SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], 
-1.0 -; SI-SDAG-NEXT: s_nop 0 -; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9] +; SI-SDAG-NEXT: s_nop 2 +; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], -1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2209,17 +2272,22 @@ define <2 x double> @v_neg_rsq_v2f64_poisonelt(<2 x double> %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 ; SI-SDAG-NEXT: v_mov_b32_e32 v2, 0 @@ -2437,7 +2505,6 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: v_ldexp_f64 v[2:3], v[2:3], v4 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[6:7], v[0:1] ; SI-SDAG-NEXT: v_rsq_f64_e32 v[4:5], v[2:3] -; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 ; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[0:1], v[6:7] ; SI-SDAG-NEXT: v_mul_f64 v[6:7], v[6:7], 0.5 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[2:3], v[4:5] @@ -2467,36 +2534,46 @@ define <2 x double> @v_neg_pos_rsq_v2f64(<2 x double> %x) { ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[10:11], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v14, vcc ; SI-SDAG-NEXT: v_ldexp_f64 v[4:5], v[4:5], v8 -; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[6:7], v[12:13], 1.0 +; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v15 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; SI-SDAG-NEXT: v_fma_f64 v[8:9], v[12:13], v[8:9], v[12:13] -; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[2:3], v[2:3], 1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[6:7], v[8:9], 1.0 -; SI-SDAG-NEXT: v_div_scale_f64 v[12:13], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], -1.0, v[0:1], -1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[8:9], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_rcp_f64_e32 v[8:9], v[10:11] -; SI-SDAG-NEXT: v_mul_f64 v[14:15], v[12:13], v[4:5] +; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[2:3], v[2:3], 1.0 +; SI-SDAG-NEXT: v_mul_f64 v[12:13], v[10:11], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v7 -; SI-SDAG-NEXT: v_fma_f64 v[16:17], -v[6:7], v[14:15], v[12:13] -; SI-SDAG-NEXT: v_fma_f64 v[18:19], -v[10:11], v[8:9], 1.0 -; SI-SDAG-NEXT: 
v_fma_f64 v[6:7], v[8:9], v[18:19], v[8:9] -; SI-SDAG-NEXT: v_div_scale_f64 v[18:19], s[4:5], 1.0, v[2:3], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[10:11], v[6:7], 1.0 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v13 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[18:19], v[6:7] +; SI-SDAG-NEXT: v_rcp_f64_e32 v[16:17], v[8:9] +; SI-SDAG-NEXT: v_fma_f64 v[14:15], -v[6:7], v[12:13], v[10:11] +; SI-SDAG-NEXT: s_mov_b32 s4, 0xbff00000 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[8:9], v[16:17], 1.0 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v11 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[16:17], v[6:7], v[16:17] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_xor_b32_e32 v18, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[8:9], v[6:7], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[16:17], s[4:5], 1.0, v[2:3], 1.0 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_and_b32_e32 v18, 1, v18 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v18 +; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[16:17], v[6:7] ; SI-SDAG-NEXT: s_mov_b32 s4, 0x3ff00000 -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[16:17], v[4:5], v[14:15] -; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[10:11], v[8:9], v[18:19] -; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v11 -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s4, v19 -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: s_nop 1 +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[14:15], v[4:5], v[12:13] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v9 +; SI-SDAG-NEXT: v_fma_f64 v[12:13], -v[8:9], v[10:11], v[16:17] +; SI-SDAG-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s4, v17 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; SI-SDAG-NEXT: v_xor_b32_e32 v8, v9, v8 +; SI-SDAG-NEXT: v_and_b32_e32 v8, 1, v8 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[0:1], -1.0 -; SI-SDAG-NEXT: s_nop 0 -; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[8:9] +; SI-SDAG-NEXT: s_nop 2 +; SI-SDAG-NEXT: v_div_fmas_f64 v[6:7], v[12:13], v[6:7], v[10:11] ; SI-SDAG-NEXT: v_div_fixup_f64 v[2:3], v[6:7], v[2:3], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -2754,17 +2831,22 @@ define double @v_rsq_f64_fneg_fabs(double %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 
v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -2925,17 +3007,22 @@ define double @v_rsq_f64__afn_sqrt(double %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4113,17 +4200,22 @@ define double @v_rsq_f64__nnan_ninf(double %x) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4812,17 +4904,22 @@ define double @v_rsq_amdgcn_sqrt_f64(double %x) { ; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0x3ff00000 ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: 
v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4893,17 +4990,22 @@ define double @v_neg_rsq_amdgcn_sqrt_f64(double %x) { ; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], v[0:1] ; SI-SDAG-NEXT: s_mov_b32 s6, 0xbff00000 ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], -1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], -1.0, v[0:1], -1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s6, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s6, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], -1.0 ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] @@ -4973,17 +5075,22 @@ define amdgpu_ps <2 x i32> @s_rsq_amdgcn_sqrt_f64(double inreg %x) { ; SI-SDAG-NEXT: v_sqrt_f64_e32 v[0:1], s[0:1] ; SI-SDAG-NEXT: s_mov_b32 s2, 0x3ff00000 ; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[0:1], v[0:1], v[0:1], 1.0 +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[0:1], 1.0, v[0:1], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s2, v7 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v10, 1, v10 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], 
v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[0:1], s2, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[0:1], vcc ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: s_nop 1 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], 1.0 ; SI-SDAG-NEXT: v_readfirstlane_b32 s0, v0 @@ -5087,18 +5194,23 @@ define double @v_div_contract_sqrt_f64(double %x, double %y) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[2:3], v[0:1] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 -; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_xor_b32_e32 v12, v13, v12 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v12, 1, v12 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; SI-SDAG-NEXT: s_nop 1 +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5254,18 +5366,23 @@ define double @v_div_arcp_sqrt_f64(double %x, double %y) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[2:3], v[0:1] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 -; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_xor_b32_e32 v12, v13, v12 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v12, 1, v12 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; SI-SDAG-NEXT: v_mul_f64 
v[8:9], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; SI-SDAG-NEXT: s_nop 1 +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5421,18 +5538,23 @@ define double @v_div_contract_arcp_sqrt_f64(double %x, double %y) { ; SI-SDAG-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc ; SI-SDAG-NEXT: v_div_scale_f64 v[4:5], s[4:5], v[2:3], v[2:3], v[0:1] +; SI-SDAG-NEXT: v_div_scale_f64 v[10:11], s[4:5], v[0:1], v[2:3], v[0:1] ; SI-SDAG-NEXT: v_rcp_f64_e32 v[6:7], v[4:5] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v3, v5 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v11 ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] -; SI-SDAG-NEXT: v_div_scale_f64 v[8:9], s[4:5], v[0:1], v[2:3], v[0:1] -; SI-SDAG-NEXT: v_fma_f64 v[10:11], -v[4:5], v[6:7], 1.0 -; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[10:11], v[6:7] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], v1, v9 -; SI-SDAG-NEXT: v_mul_f64 v[10:11], v[8:9], v[6:7] -; SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc -; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[10:11], v[8:9] -; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[10:11] +; SI-SDAG-NEXT: v_xor_b32_e32 v12, v13, v12 +; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[4:5], v[6:7], 1.0 +; SI-SDAG-NEXT: v_and_b32_e32 v12, 1, v12 +; SI-SDAG-NEXT: v_fma_f64 v[6:7], v[6:7], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[10:11], v[6:7] +; SI-SDAG-NEXT: v_fma_f64 v[4:5], -v[4:5], v[8:9], v[10:11] +; SI-SDAG-NEXT: s_nop 1 +; SI-SDAG-NEXT: v_div_fmas_f64 v[4:5], v[4:5], v[6:7], v[8:9] ; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[4:5], v[2:3], v[0:1] ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; @@ -5573,8 +5695,8 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-SDAG-NEXT: v_mov_b32_e32 v8, 0xffffff80 ; SI-SDAG-NEXT: v_rsq_f64_e32 v[2:3], v[0:1] ; SI-SDAG-NEXT: v_mov_b32_e32 v9, 0x260 -; SI-SDAG-NEXT: s_mov_b32 s6, 0 -; SI-SDAG-NEXT: s_mov_b32 s7, 0x40700000 +; SI-SDAG-NEXT: s_mov_b32 s4, 0 +; SI-SDAG-NEXT: s_mov_b32 s5, 0x40700000 ; SI-SDAG-NEXT: v_mul_f64 v[4:5], v[0:1], v[2:3] ; SI-SDAG-NEXT: v_mul_f64 v[2:3], v[2:3], 0.5 ; SI-SDAG-NEXT: s_mov_b32 s8, 0x40700000 @@ -5590,20 +5712,25 @@ define double @v_div_const_contract_sqrt_f64(double %x) { ; SI-SDAG-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v9 ; SI-SDAG-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; SI-SDAG-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[4:5], v[0:1], v[0:1], s[6:7] +; SI-SDAG-NEXT: v_div_scale_f64 v[2:3], s[6:7], v[0:1], v[0:1], s[4:5] ; SI-SDAG-NEXT: v_rcp_f64_e32 v[4:5], v[2:3] ; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, v1, v3 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc ; SI-SDAG-NEXT: v_fma_f64 v[6:7], -v[2:3], v[4:5], 1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[6:7], v[4:5] -; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[4:5], s[6:7], v[0:1], s[6:7] +; SI-SDAG-NEXT: v_div_scale_f64 v[6:7], s[6:7], s[4:5], v[0:1], s[4:5] ; SI-SDAG-NEXT: v_fma_f64 v[8:9], -v[2:3], v[4:5], 1.0 ; SI-SDAG-NEXT: v_fma_f64 v[4:5], v[4:5], v[8:9], v[4:5] -; SI-SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], s8, v7 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, s8, v7 +; SI-SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc ; SI-SDAG-NEXT: v_mul_f64 v[8:9], v[6:7], v[4:5] -; 
SI-SDAG-NEXT: s_xor_b64 vcc, s[4:5], vcc +; SI-SDAG-NEXT: v_xor_b32_e32 v10, v11, v10 ; SI-SDAG-NEXT: v_fma_f64 v[2:3], -v[2:3], v[8:9], v[6:7] +; SI-SDAG-NEXT: v_and_b32_e32 v6, 1, v10 +; SI-SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; SI-SDAG-NEXT: s_nop 3 ; SI-SDAG-NEXT: v_div_fmas_f64 v[2:3], v[2:3], v[4:5], v[8:9] -; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[6:7] +; SI-SDAG-NEXT: v_div_fixup_f64 v[0:1], v[2:3], v[0:1], s[4:5] ; SI-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; SI-GISEL-LABEL: v_div_const_contract_sqrt_f64: diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll index 4177179b31c06..2495993339587 100644 --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -20,18 +20,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: s_add_u32 s10, s2, s8 -; SI-NEXT: s_addc_u32 s11, s3, s9 -; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] -; SI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[8:9], 0 ; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_add_u32 s0, s2, s8 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_mov_b32 s5, s1 -; SI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: v_mov_b32_e32 v1, s11 -; SI-NEXT: v_add_i32_e32 v0, vcc, s10, v0 +; SI-NEXT: s_addc_u32 s1, s3, s9 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; SI-NEXT: s_lshr_b32 s2, s9, 31 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v0, s2, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_add_i32_e32 v0, vcc, s0, v0 ; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -41,18 +42,19 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v1, s2 -; VI-NEXT: s_add_u32 s6, s2, s4 -; VI-NEXT: v_mov_b32_e32 v2, s3 -; VI-NEXT: s_addc_u32 s7, s3, s5 -; VI-NEXT: v_cmp_lt_i64_e64 s[8:9], s[4:5], 0 -; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[1:2] ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s2, s4 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 -; VI-NEXT: s_xor_b64 s[0:1], s[8:9], vcc -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_add_u32_e32 v2, vcc, s6, v2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s1, s3, s5 +; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[2:3] +; VI-NEXT: s_lshr_b32 s2, s5, 31 +; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; VI-NEXT: v_xor_b32_e32 v2, s2, v2 +; VI-NEXT: v_and_b32_e32 v2, 1, v2 +; VI-NEXT: v_mov_b32_e32 v3, s1 +; VI-NEXT: v_add_u32_e32 v2, vcc, s0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -67,11 +69,12 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX9-NEXT: s_add_u32 s4, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_addc_u32 s5, s3, s7 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[8:9], s[6:7], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] +; GFX9-NEXT: s_lshr_b32 s2, s7, 31 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, s2, v0 +; 
GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: s_xor_b64 s[2:3], s[8:9], vcc -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s4, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -86,10 +89,11 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s4, s2, s6 ; GFX10-NEXT: s_addc_u32 s5, s3, s7 -; GFX10-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 ; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] -; GFX10-NEXT: s_xor_b32 s2, s6, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s7, 31 +; GFX10-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: v_add_co_u32 v0, s2, s4, v0 ; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s2, s5, 0, s2 ; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[0:1] @@ -104,11 +108,14 @@ define amdgpu_kernel void @saddo_i64_zext(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s6, s2, s4 ; GFX11-NEXT: s_addc_u32 s7, s3, s5 -; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 -; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3] -; GFX11-NEXT: s_xor_b32 s2, s4, s2 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[6:7], s[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-NEXT: s_lshr_b32 s2, s5, 31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_xor_b32_e32 v0, s2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: v_add_co_u32 v0, s2, s6, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s7, 0, s2 @@ -132,21 +139,20 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b32 s4, s0 -; SI-NEXT: s_add_i32 s12, s8, s9 -; SI-NEXT: s_cmp_lt_i32 s9, 0 -; SI-NEXT: s_cselect_b64 s[10:11], -1, 0 -; SI-NEXT: s_cmp_lt_i32 s12, s8 -; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_lshr_b32 s10, s9, 31 +; SI-NEXT: s_add_i32 s11, s8, s9 +; SI-NEXT: s_cmp_lt_i32 s11, s8 ; SI-NEXT: s_cselect_b64 s[8:9], -1, 0 -; SI-NEXT: v_mov_b32_e32 v0, s12 -; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 -; SI-NEXT: s_xor_b64 s[4:5], s[10:11], s[8:9] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] +; SI-NEXT: v_xor_b32_e32 v0, s10, v0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_mov_b32 s0, s2 ; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s6 ; SI-NEXT: s_mov_b32 s3, s7 -; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_mov_b32_e32 v1, s11 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -155,19 +161,19 @@ define amdgpu_kernel void @s_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_lshr_b32 s2, s5, 31 +; VI-NEXT: s_add_i32 s3, s4, s5 +; VI-NEXT: s_cmp_lt_i32 s3, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: s_add_i32 s6, s4, s5 -; VI-NEXT: s_cmp_lt_i32 s5, 0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; 
VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_cmp_lt_i32 s6, s4 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 -; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] -; VI-NEXT: flat_store_dword v[0:1], v4 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, s[0:1] +; VI-NEXT: v_xor_b32_e32 v4, s2, v4 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: flat_store_dword v[0:1], v5 +; VI-NEXT: v_and_b32_e32 v0, 1, v4 ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -250,10 +256,11 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_mov_b32 s5, s3 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v2, vcc, v0, v1 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; SI-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v0 -; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; SI-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0 +; SI-NEXT: v_lshrrev_b32_e32 v1, 31, v1 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dword v2, off, s[8:11], 0 ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -274,11 +281,12 @@ define amdgpu_kernel void @v_saddo_i32(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v6, vcc, v4, v5 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v5 -; VI-NEXT: v_cmp_lt_i32_e64 s[0:1], v6, v4 -; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] +; VI-NEXT: v_cmp_lt_i32_e32 vcc, v6, v4 +; VI-NEXT: v_lshrrev_b32_e32 v5, 31, v5 +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: v_xor_b32_e32 v4, v5, v4 ; VI-NEXT: flat_store_dword v[0:1], v6 -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: v_and_b32_e32 v0, 1, v4 ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -355,19 +363,20 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_addc_u32 s13, s5, s7 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] -; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0 +; SI-NEXT: s_lshr_b32 s4, s7, 31 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v2, s4, v0 ; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 ; SI-NEXT: v_mov_b32_e32 v1, s13 -; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; SI-NEXT: s_mov_b32 s0, s2 ; SI-NEXT: s_mov_b32 s1, s3 ; SI-NEXT: s_mov_b32 s2, s10 ; SI-NEXT: s_mov_b32 s3, s11 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_waitcnt expcnt(0) -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; SI-NEXT: v_and_b32_e32 v0, 1, v2 ; SI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -381,15 +390,16 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s5, s7 ; VI-NEXT: v_mov_b32_e32 v5, s5 -; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[4:5] -; VI-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0 +; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; VI-NEXT: s_lshr_b32 s2, s7, 31 +; VI-NEXT: v_xor_b32_e32 v6, s2, v4 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 -; VI-NEXT: s_xor_b64 s[0:1], s[2:3], vcc +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[4:5] 
-; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: v_and_b32_e32 v0, 1, v6 ; VI-NEXT: flat_store_byte v[2:3], v0 ; VI-NEXT: s_endpgm ; @@ -402,31 +412,33 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: v_mov_b32_e32 v0, s12 ; GFX9-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-NEXT: s_addc_u32 s1, s13, s15 -; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[14:15], 0 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] +; GFX9-NEXT: s_lshr_b32 s2, s15, 31 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_xor_b32_e32 v3, s2, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-NEXT: s_xor_b64 s[0:1], s[2:3], vcc ; GFX9-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX9-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX9-NEXT: global_store_byte v2, v0, s[10:11] ; GFX9-NEXT: s_endpgm ; ; GFX10-LABEL: s_saddo_i64: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s12, s14 ; GFX10-NEXT: s_addc_u32 s1, s13, s15 -; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[14:15], 0 -; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[0:1], s[12:13] +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[0:1], s[12:13] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX10-NEXT: s_lshr_b32 s2, s15, 31 +; GFX10-NEXT: v_xor_b32_e32 v2, s2, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: s_xor_b32 s0, s2, s3 -; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s0 -; GFX10-NEXT: global_store_dwordx2 v2, v[0:1], s[8:9] -; GFX10-NEXT: global_store_byte v2, v3, s[10:11] +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: global_store_dwordx2 v3, v[0:1], s[8:9] +; GFX10-NEXT: global_store_byte v3, v2, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: s_saddo_i64: @@ -435,16 +447,19 @@ define amdgpu_kernel void @s_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_add_u32 s8, s4, s6 ; GFX11-NEXT: s_addc_u32 s9, s5, s7 -; GFX11-NEXT: v_cmp_lt_i64_e64 s6, s[6:7], 0 +; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[8:9], s[4:5] -; GFX11-NEXT: v_mov_b32_e32 v0, s8 -; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s9 -; GFX11-NEXT: s_xor_b32 s4, s6, s4 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s4 +; GFX11-NEXT: s_lshr_b32 s4, s7, 31 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_xor_b32_e32 v2, s4, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] -; GFX11-NEXT: global_store_b8 v2, v3, s[2:3] +; GFX11-NEXT: global_store_b64 v3, v[0:1], s[0:1] +; GFX11-NEXT: global_store_b8 v3, v2, s[2:3] ; GFX11-NEXT: s_endpgm %sadd = call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %a, i64 %b) nounwind %val = extractvalue { i64, i1 } %sadd, 0 @@ -478,11 +493,12 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; SI-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc -; SI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; SI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] +; 
SI-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; SI-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v0, v2, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 ; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 -; SI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; SI-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -503,11 +519,12 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v8, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v9, vcc, v1, v3, vcc -; VI-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; VI-NEXT: v_cmp_lt_i64_e64 s[0:1], v[8:9], v[0:1] +; VI-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-NEXT: v_xor_b32_e32 v0, v2, v0 +; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] -; VI-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; VI-NEXT: flat_store_byte v[6:7], v0 ; VI-NEXT: s_endpgm ; @@ -521,11 +538,12 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc -; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[2:3] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[4:5], v[0:1] +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] +; GFX9-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX9-NEXT: global_store_byte v6, v0, s[10:11] ; GFX9-NEXT: s_endpgm ; @@ -540,34 +558,36 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX10-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; GFX10-NEXT: s_xor_b32 s0, vcc_lo, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 31, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: global_store_dwordx2 v6, v[4:5], s[8:9] ; GFX10-NEXT: global_store_byte v6, v0, s[10:11] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: v_saddo_i64: ; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b256 s[4:11], s[4:5], 0x24 +; GFX11-NEXT: s_load_b256 s[0:7], s[4:5], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v6, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_load_b64 v[0:1], v6, s[8:9] -; GFX11-NEXT: global_load_b64 v[2:3], v6, s[10:11] +; GFX11-NEXT: global_load_b64 v[0:1], v6, s[4:5] +; GFX11-NEXT: global_load_b64 v[2:3], v6, s[6:7] ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3] -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1] -; 
GFX11-NEXT: s_xor_b32 s0, vcc_lo, s0 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 31, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: global_store_b64 v6, v[4:5], s[4:5] -; GFX11-NEXT: global_store_b8 v6, v0, s[6:7] +; GFX11-NEXT: global_store_b64 v6, v[4:5], s[0:1] +; GFX11-NEXT: global_store_b8 v6, v0, s[2:3] ; GFX11-NEXT: s_endpgm %a = load i64, ptr addrspace(1) %aptr, align 4 %b = load i64, ptr addrspace(1) %bptr, align 4 @@ -582,37 +602,39 @@ define amdgpu_kernel void @v_saddo_i64(ptr addrspace(1) %out, ptr addrspace(1) % define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %carryout, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) nounwind { ; SI-LABEL: v_saddo_v2i32: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 -; SI-NEXT: s_mov_b32 s11, 0xf000 -; SI-NEXT: s_mov_b32 s10, -1 -; SI-NEXT: s_mov_b32 s14, s10 -; SI-NEXT: s_mov_b32 s15, s11 +; SI-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_mov_b32 s14, s2 +; SI-NEXT: s_mov_b32 s15, s3 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s4 -; SI-NEXT: s_mov_b32 s13, s5 -; SI-NEXT: s_mov_b32 s4, s6 -; SI-NEXT: s_mov_b32 s5, s7 -; SI-NEXT: s_mov_b32 s6, s10 -; SI-NEXT: s_mov_b32 s7, s11 +; SI-NEXT: s_mov_b32 s12, s8 +; SI-NEXT: s_mov_b32 s13, s9 +; SI-NEXT: s_mov_b32 s8, s10 +; SI-NEXT: s_mov_b32 s9, s11 +; SI-NEXT: s_mov_b32 s10, s2 +; SI-NEXT: s_mov_b32 s11, s3 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 -; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[4:7], 0 -; SI-NEXT: s_mov_b32 s8, s0 -; SI-NEXT: s_mov_b32 s9, s1 -; SI-NEXT: s_mov_b32 s12, s2 -; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: buffer_load_dwordx2 v[2:3], off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s0, s4 +; SI-NEXT: s_mov_b32 s1, s5 +; SI-NEXT: s_mov_b32 s8, s6 +; SI-NEXT: s_mov_b32 s9, s7 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_add_i32_e32 v5, vcc, v1, v3 ; SI-NEXT: v_add_i32_e32 v4, vcc, v0, v2 -; SI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 -; SI-NEXT: v_cmp_lt_i32_e64 s[4:5], v5, v1 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; SI-NEXT: v_cmp_lt_i32_e64 s[2:3], v4, v0 -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; SI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] -; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[8:11], 0 -; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[12:15], 0 +; SI-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_cmp_lt_i32_e32 vcc, v5, v1 +; SI-NEXT: v_lshrrev_b32_e32 v2, 31, v2 +; SI-NEXT: v_lshrrev_b32_e32 v3, 31, v3 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v0, v2, v0 +; SI-NEXT: v_xor_b32_e32 v1, v3, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: v_saddo_v2i32: @@ -632,14 +654,16 @@ define amdgpu_kernel void @v_saddo_v2i32(ptr addrspace(1) %out, ptr addrspace(1) ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v9, vcc, v1, v3 ; VI-NEXT: v_add_u32_e32 
v8, vcc, v0, v2 -; VI-NEXT: v_cmp_gt_i32_e64 s[0:1], 0, v3 -; VI-NEXT: v_cmp_lt_i32_e64 s[4:5], v9, v1 -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; VI-NEXT: v_cmp_lt_i32_e64 s[2:3], v8, v0 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[4:5] -; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; VI-NEXT: s_xor_b64 s[0:1], vcc, s[2:3] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; VI-NEXT: v_cmp_lt_i32_e32 vcc, v8, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-NEXT: v_cmp_lt_i32_e32 vcc, v9, v1 +; VI-NEXT: v_lshrrev_b32_e32 v2, 31, v2 +; VI-NEXT: v_lshrrev_b32_e32 v3, 31, v3 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_xor_b32_e32 v0, v2, v0 +; VI-NEXT: v_xor_b32_e32 v1, v3, v1 +; VI-NEXT: v_and_b32_e32 v1, 1, v1 +; VI-NEXT: v_and_b32_e32 v0, 1, v0 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[8:9] ; VI-NEXT: flat_store_dwordx2 v[6:7], v[0:1] ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/saddsat.ll b/llvm/test/CodeGen/AMDGPU/saddsat.ll index 4e27cf20d3c98..da1631f9fe903 100644 --- a/llvm/test/CodeGen/AMDGPU/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/saddsat.ll @@ -82,11 +82,15 @@ define i16 @v_saddsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i16: @@ -120,25 +124,31 @@ define i32 @v_saddsat_i32(i32 %lhs, i32 %rhs) { ; GFX6-LABEL: v_saddsat_i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX6-NEXT: v_add_i32_e64 v1, s[4:5], v0, v1 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GFX6-NEXT: v_add_i32_e32 v1, vcc, v0, v1 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX8-NEXT: v_add_u32_e64 v1, s[4:5], v0, v1 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 31, v1 +; GFX8-NEXT: v_add_u32_e32 v1, vcc, v0, v1 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v2, v0 +; 
GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_i32: @@ -183,19 +193,27 @@ define <2 x i16> @v_saddsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v4, v3, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_add_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -246,25 +264,37 @@ define <3 x i16> @v_saddsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_xor_b32_e32 v7, 0xffff8000, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v3, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v5, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; 
GFX8-NEXT: v_add_u16_e32 v2, v0, v2 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -323,37 +353,53 @@ define <2 x float> @v_saddsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_add_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_xor_b32_e32 v7, 0xffff8000, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_add_u16_e32 v2, v0, v2 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_add_u16_e32 v5, v4, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cmp_gt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 15, v5 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, 0xffff8000, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; GFX8-NEXT: v_cmp_gt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_add_u16_e32 v3, v1, v3 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: 
v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v3, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -379,39 +425,51 @@ define <2 x i32> @v_saddsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6-LABEL: v_saddsat_v2i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v0, v2 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX6-NEXT: v_add_i32_e64 v2, s[4:5], v1, v3 -; GFX6-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: v_lshrrev_b32_e32 v4, 31, v2 +; GFX6-NEXT: v_add_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX6-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; GFX6-NEXT: v_add_i32_e32 v3, vcc, v1, v3 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_saddsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v0, v2 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_add_u32_e64 v2, s[4:5], v1, v3 -; GFX8-NEXT: v_cmp_gt_i32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v4, 31, v2 +; GFX8-NEXT: v_add_u32_e32 v2, vcc, v0, v2 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc +; GFX8-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; GFX8-NEXT: 
v_add_u32_e32 v3, vcc, v1, v3 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_saddsat_v2i32: @@ -438,11 +496,14 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_add_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX6-NEXT: v_lshrrev_b32_e32 v1, 31, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v5 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -452,11 +513,14 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_add_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX8-NEXT: v_lshrrev_b32_e32 v1, 31, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v5 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -466,11 +530,14 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3] -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: v_lshrrev_b32_e32 v1, 31, v3 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -479,13 +546,16 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: 
v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 31, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_saddsat_i64: @@ -493,12 +563,15 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 31, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 +; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll b/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll index 9c1060ee089f0..ee3f4fb2246b9 100644 --- a/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll +++ b/llvm/test/CodeGen/AMDGPU/scalar-branch-missing-and-exec.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx600 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx700 -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx802 -verify-machineinstrs < %s | FileCheck %s @@ -52,3 +53,5 @@ bb50: ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll index a166c4f93462d..00d9a1bce53d5 100644 --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -142,7 +142,6 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd -; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i32 s4, s3, 31 ; GCN-IR-NEXT: s_mov_b32 s5, s4 @@ -153,23 +152,32 @@ define amdgpu_kernel void @s_test_sdiv(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_subb_u32 s13, s3, s4 ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[8:9], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s2, s2, s6 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[12:13], 0 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s6 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[12:13], 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[10:11] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[8:9] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[10:11] +; GCN-IR-NEXT: v_readfirstlane_b32 s10, v1 +; GCN-IR-NEXT: v_readfirstlane_b32 s11, v0 +; GCN-IR-NEXT: s_or_b32 s15, s10, s11 ; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[12:13] ; GCN-IR-NEXT: s_sub_u32 s16, s14, s20 ; GCN-IR-NEXT: s_subb_u32 s17, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[18:19], s[16:17], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[22:23], s[16:17], 63 -; GCN-IR-NEXT: s_or_b64 s[18:19], s[10:11], s[18:19] -; GCN-IR-NEXT: s_and_b64 s[10:11], s[18:19], exec +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[16:17], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[16:17], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[10:11] +; GCN-IR-NEXT: v_readfirstlane_b32 s10, v0 +; GCN-IR-NEXT: s_or_b32 s15, s15, s10 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[18:19] +; GCN-IR-NEXT: v_or_b32_e32 v0, s15, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: s_bitcmp1_b32 s15, 0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 ; GCN-IR-NEXT: s_cselect_b32 s11, 0, s13 ; GCN-IR-NEXT: s_cselect_b32 s10, 0, s12 -; GCN-IR-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GCN-IR-NEXT: s_mov_b64 s[8:9], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] +; GCN-IR-NEXT: s_mov_b32 s15, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s16, 1 @@ -366,29 +374,37 @@ define i64 @v_test_sdiv(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v1, v3, v13 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v13 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v13, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7] +; GCN-IR-NEXT: v_mov_b32_e32 v14, v12 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v15, v13 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v3, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v6 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[6:7], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v7 ; GCN-IR-NEXT: v_min_u32_e32 v11, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 
v2, s[6:7], v10, v11 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[6:7] -; GCN-IR-NEXT: v_subb_u32_e64 v3, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, v10, v11 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-IR-NEXT: v_xor_b32_e32 v8, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v14, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v13 -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v7, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v6, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v2 @@ -1231,16 +1247,22 @@ define amdgpu_kernel void @s_test_sdiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_sub_u32 s2, s2, s4 ; GCN-IR-NEXT: s_subb_u32 s3, s3, s4 ; GCN-IR-NEXT: s_flbit_i32_b64 s14, s[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 ; GCN-IR-NEXT: s_add_u32 s10, s14, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s11, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[10:11], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[10:11], 63 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[8:9], s[12:13] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[12:13], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[8:9], s[10:11], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[10:11], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] +; GCN-IR-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-IR-NEXT: s_or_b32 s9, s8, s9 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[12:13] +; GCN-IR-NEXT: v_or_b32_e32 v0, s9, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: s_bitcmp1_b32 s9, 0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[16:17] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 @@ -1418,19 +1440,26 @@ define i64 @v_test_sdiv_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: s_movk_i32 s4, 0xffc5 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v10 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 
s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 @@ -1611,20 +1640,27 @@ define i64 @v_test_sdiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: s_movk_i32 s4, 0xffd0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v10 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0x8000 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0x8000 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v7, 0, vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 @@ -1710,21 +1746,28 @@ define i64 @v_test_sdiv_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v0, v10 ; GCN-IR-NEXT: v_subb_u32_e32 v5, vcc, v1, v10, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v0, v4 -; GCN-IR-NEXT: v_add_i32_e64 v0, s[4:5], 32, v0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 32, v0 ; GCN-IR-NEXT: v_ffbh_u32_e32 v1, v5 ; GCN-IR-NEXT: v_min_u32_e32 v8, v0, v1 -; GCN-IR-NEXT: v_sub_i32_e64 v0, s[4:5], 48, v8 -; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, s[4:5] -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[0:1] +; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, 48, v8 +; GCN-IR-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[0:1] ; GCN-IR-NEXT: 
v_mov_b32_e32 v11, v10 -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[0:1] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v5, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v4, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB13_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v0 diff --git a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll index 5eb3ae8d9a8fd..3596922dfed3a 100644 --- a/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll +++ b/llvm/test/CodeGen/AMDGPU/select-constant-cttz.ll @@ -8,24 +8,27 @@ define amdgpu_kernel void @select_constant_cttz(ptr addrspace(1) noalias %out, p ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_load_dword s2, s[2:3], 0x0 +; GCN-NEXT: s_load_dword s4, s[2:3], 0x0 ; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_lshr_b32 s4, 1, s2 -; GCN-NEXT: s_cmp_lg_u32 s2, 0 -; GCN-NEXT: s_ff1_i32_b32 s2, s4 +; GCN-NEXT: s_lshr_b32 s5, 1, s4 +; GCN-NEXT: s_cmp_lg_u32 s4, 0 +; GCN-NEXT: s_ff1_i32_b32 s6, s5 ; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GCN-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GCN-NEXT: s_cselect_b32 s2, -1, s2 -; GCN-NEXT: s_flbit_i32 s6, s2 -; GCN-NEXT: s_sub_i32 s8, 31, s6 -; GCN-NEXT: s_cmp_eq_u32 s2, 0 -; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 -; GCN-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GCN-NEXT: s_and_b64 s[4:5], s[4:5], exec -; GCN-NEXT: s_cselect_b32 s4, -1, s8 -; GCN-NEXT: s_mov_b32 s2, -1 -; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: s_cselect_b32 s4, -1, s6 +; GCN-NEXT: s_flbit_i32 s5, s4 +; GCN-NEXT: s_sub_i32 s5, 31, s5 +; GCN-NEXT: s_cmp_eq_u32 s4, 0 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GCN-NEXT: v_or_b32_e32 v0, v0, v2 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, v1, -1, vcc ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: s_endpgm %v = load i32, ptr addrspace(1) %arrayidx, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/select-opt.ll b/llvm/test/CodeGen/AMDGPU/select-opt.ll index 9ef384fb73051..0e379bd4cc08a 100644 --- a/llvm/test/CodeGen/AMDGPU/select-opt.ll +++ b/llvm/test/CodeGen/AMDGPU/select-opt.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s ; Make sure to test with f32 and i32 compares. 
If we have to use float @@ -15,6 +16,28 @@ ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; GCN: buffer_store_dword [[VRESULT]] define amdgpu_kernel void @opt_select_i32_and_cmp_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { +; GCN-LABEL: opt_select_i32_and_cmp_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dword s8, s[4:5], 0xf +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, s2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[8:9] +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; GCN-NEXT: v_and_b32_e32 v2, v2, v3 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %and = and i1 %icmp0, %icmp1 @@ -32,6 +55,28 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_i32(ptr addrspace(1) %out, i32 ; GCN: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; GCN: buffer_store_dword [[VRESULT]] define amdgpu_kernel void @opt_select_i32_and_cmp_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { +; GCN-LABEL: opt_select_i32_and_cmp_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xf +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s0, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_and_b32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GCN-NEXT: s_endpgm %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %and = and i1 %fcmp0, %fcmp1 @@ -53,6 +98,30 @@ define amdgpu_kernel void @opt_select_i32_and_cmp_f32(ptr addrspace(1) %out, flo ; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]] ; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]] define amdgpu_kernel void @opt_select_i64_and_cmp_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { +; GCN-LABEL: opt_select_i64_and_cmp_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s8, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, s10 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GCN-NEXT: v_and_b32_e32 v2, v2, v3 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; 
GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %and = and i1 %icmp0, %icmp1 @@ -72,6 +141,30 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_i32(ptr addrspace(1) %out, i32 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]] ; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]] define amdgpu_kernel void @opt_select_i64_and_cmp_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { +; GCN-LABEL: opt_select_i64_and_cmp_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s8, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s8, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_and_b32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %and = and i1 %fcmp0, %fcmp1 @@ -92,6 +185,28 @@ define amdgpu_kernel void @opt_select_i64_and_cmp_f32(ptr addrspace(1) %out, flo ; GCN: buffer_store_dword [[VRESULT]] ; GCN: s_endpgm define amdgpu_kernel void @opt_select_i32_or_cmp_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %x, i32 %y) #0 { +; GCN-LABEL: opt_select_i32_or_cmp_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dword s8, s[4:5], 0xf +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s0, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s8 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s0, s2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[8:9] +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %or = or i1 %icmp0, %icmp1 @@ -109,6 +224,28 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_i32(ptr addrspace(1) %out, i32 ; GCN-DAG: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[RESULT]] ; GCN: buffer_store_dword [[VRESULT]] define amdgpu_kernel void @opt_select_i32_or_cmp_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i32 %x, i32 %y) #0 { +; GCN-LABEL: opt_select_i32_or_cmp_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x9 +; GCN-NEXT: s_load_dword s4, s[4:5], 0xf +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s1 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s0, v1 +; GCN-NEXT: 
v_cndmask_b32_e64 v1, 0, 1, vcc +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s0, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s4 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[8:11], 0 +; GCN-NEXT: s_endpgm %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %or = or i1 %fcmp0, %fcmp1 @@ -130,6 +267,30 @@ define amdgpu_kernel void @opt_select_i32_or_cmp_f32(ptr addrspace(1) %out, floa ; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]] ; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]] define amdgpu_kernel void @opt_select_i64_or_cmp_i32(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i64 %x, i64 %y) #0 { +; GCN-LABEL: opt_select_i64_or_cmp_i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_lg_u32 s8, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s15 +; GCN-NEXT: v_mov_b32_e32 v1, s13 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: s_cmp_lg_u32 s8, s10 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] +; GCN-NEXT: v_or_b32_e32 v2, v2, v3 +; GCN-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %icmp0 = icmp ne i32 %a, %b %icmp1 = icmp ne i32 %a, %c %or = or i1 %icmp0, %icmp1 @@ -149,6 +310,30 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_i32(ptr addrspace(1) %out, i32 ; GCN-DAG: v_mov_b32_e32 v[[VRESULT0:[0-9]+]], [[RESULT1]] ; GCN: buffer_store_dwordx2 v[[[VRESULT0]]:[[VRESULT1]]] define amdgpu_kernel void @opt_select_i64_or_cmp_f32(ptr addrspace(1) %out, float %a, float %b, float %c, i64 %x, i64 %y) #0 { +; GCN-LABEL: opt_select_i64_or_cmp_f32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s9 +; GCN-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s8, v1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s8, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GCN-NEXT: v_mov_b32_e32 v2, s15 +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s14 +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm %fcmp0 = fcmp one float %a, %b %fcmp1 = fcmp one float %a, %c %or = or i1 %fcmp0, %fcmp1 @@ -161,6 +346,26 @@ define amdgpu_kernel void @opt_select_i64_or_cmp_f32(ptr addrspace(1) %out, floa ; GCN: v_cmp_neq_f32_e64 s{{\[[0-9]+:[0-9]+\]}}, s{{[0-9]+}}, 1.0 define amdgpu_kernel void @regression(ptr addrspace(1) %out, float %c0, float %c1) #0 { +; GCN-LABEL: regression: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dword s0, 
s[4:5], 0xb +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_cmp_neq_f32_e64 s[0:1], s0, 1.0 +; GCN-NEXT: s_and_b64 vcc, exec, s[0:1] +; GCN-NEXT: s_cbranch_vccnz .LBB8_2 +; GCN-NEXT: ; %bb.1: ; %if0 +; GCN-NEXT: s_mov_b64 s[0:1], 0 +; GCN-NEXT: s_branch .LBB8_3 +; GCN-NEXT: .LBB8_2: +; GCN-NEXT: s_mov_b64 s[0:1], -1 +; GCN-NEXT: .LBB8_3: ; %endif +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 4.0, s[0:1] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm entry: %cmp0 = fcmp oeq float %c0, 1.0 br i1 %cmp0, label %if0, label %endif diff --git a/llvm/test/CodeGen/AMDGPU/setcc.ll b/llvm/test/CodeGen/AMDGPU/setcc.ll index cc82f532fc477..f8a22ec1161e4 100644 --- a/llvm/test/CodeGen/AMDGPU/setcc.ll +++ b/llvm/test/CodeGen/AMDGPU/setcc.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn-- -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=GCN -check-prefix=FUNC %s ; RUN: llc -mtriple=r600-- -mcpu=redwood -verify-machineinstrs < %s | FileCheck -allow-deprecated-dag-overlap -check-prefix=R600 -check-prefix=FUNC %s @@ -10,6 +11,33 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; GCN: s_cmp_eq_u32 ; GCN: s_cmp_eq_u32 define amdgpu_kernel void @setcc_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x i32> %b) #0 { +; GCN-LABEL: setcc_v2i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0xb +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s4, s6 +; GCN-NEXT: s_cselect_b64 s[8:9], -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s5, s7 +; GCN-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[8:9] +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: setcc_v2i32: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETE_INT * T0.Y, KC0[3].X, KC0[3].Z, +; R600-NEXT: SETE_INT * T0.X, KC0[2].W, KC0[3].Y, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) %result = icmp eq <2 x i32> %a, %b %sext = sext <2 x i1> %result to <2 x i32> store <2 x i32> %sext, ptr addrspace(1) %out @@ -27,6 +55,49 @@ define amdgpu_kernel void @setcc_v2i32(ptr addrspace(1) %out, <2 x i32> %a, <2 x ; GCN: s_cmp_eq_u32 ; GCN: s_cmp_eq_u32 define amdgpu_kernel void @setcc_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 { +; GCN-LABEL: setcc_v4i32: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx8 s[0:7], s[10:11], 0x0 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_eq_u32 s0, s4 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s1, s5 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s2, s6 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[0:1] +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: s_cmp_eq_u32 s3, s7 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, s[0:1] +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: 
v_cndmask_b32_e64 v3, 0, -1, s[0:1] +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[12:13] +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: setcc_v4i32: +; R600: ; %bb.0: +; R600-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 +; R600-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: MOV * T0.X, KC0[2].Z, +; R600-NEXT: ALU clause starting at 11: +; R600-NEXT: SETE_INT * T0.W, T0.W, T1.W, +; R600-NEXT: SETE_INT * T0.Z, T0.Z, T1.Z, +; R600-NEXT: SETE_INT * T0.Y, T0.Y, T1.Y, +; R600-NEXT: SETE_INT T0.X, T0.X, T1.X, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -44,6 +115,30 @@ define amdgpu_kernel void @setcc_v4i32(ptr addrspace(1) %out, ptr addrspace(1) % ; R600: SETE_DX10 ; GCN: v_cmp_eq_f32 define amdgpu_kernel void @f32_oeq(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_oeq: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_oeq: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETE_DX10 * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp oeq float %a, %b %1 = sext i1 %0 to i32 @@ -55,6 +150,30 @@ entry: ; R600: SETGT_DX10 ; GCN: v_cmp_gt_f32 define amdgpu_kernel void @f32_ogt(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_ogt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_gt_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_ogt: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGT_DX10 * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp ogt float %a, %b %1 = sext i1 %0 to i32 @@ -66,6 +185,30 @@ entry: ; R600: SETGE_DX10 ; GCN: v_cmp_ge_f32 define amdgpu_kernel void @f32_oge(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_oge: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, 
-1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_ge_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_oge: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGE_DX10 * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp oge float %a, %b %1 = sext i1 %0 to i32 @@ -77,6 +220,30 @@ entry: ; R600: SETGT_DX10 ; GCN: v_cmp_lt_f32 define amdgpu_kernel void @f32_olt(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_olt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_lt_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_olt: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGT_DX10 * T1.X, KC0[2].W, KC0[2].Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp olt float %a, %b %1 = sext i1 %0 to i32 @@ -88,6 +255,30 @@ entry: ; R600: SETGE_DX10 ; GCN: v_cmp_le_f32 define amdgpu_kernel void @f32_ole(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_ole: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_le_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_ole: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGE_DX10 * T1.X, KC0[2].W, KC0[2].Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp ole float %a, %b %1 = sext i1 %0 to i32 @@ -104,6 +295,33 @@ entry: ; GCN: v_cmp_lg_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc define amdgpu_kernel void @f32_one(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_one: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_lg_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_one: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; 
R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGT_DX10 T0.W, KC0[2].Z, KC0[2].W, +; R600-NEXT: SETGT_DX10 * T1.W, KC0[2].W, KC0[2].Z, +; R600-NEXT: OR_INT * T0.W, PV.W, PS, +; R600-NEXT: SETNE_INT T0.X, PV.W, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp one float %a, %b %1 = sext i1 %0 to i32 @@ -118,6 +336,33 @@ entry: ; R600-DAG: SETNE_INT ; GCN: v_cmp_o_f32 define amdgpu_kernel void @f32_ord(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_ord: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_o_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_ord: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETE_DX10 T0.W, KC0[2].Z, KC0[2].Z, +; R600-NEXT: SETE_DX10 * T1.W, KC0[2].W, KC0[2].W, +; R600-NEXT: AND_INT * T0.W, PV.W, PS, +; R600-NEXT: SETNE_INT T0.X, PV.W, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp ord float %a, %b %1 = sext i1 %0 to i32 @@ -134,6 +379,33 @@ entry: ; GCN: v_cmp_nlg_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc define amdgpu_kernel void @f32_ueq(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_ueq: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_nlg_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_ueq: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGT_DX10 T0.W, KC0[2].Z, KC0[2].W, +; R600-NEXT: SETGT_DX10 * T1.W, KC0[2].W, KC0[2].Z, +; R600-NEXT: OR_INT * T0.W, PV.W, PS, +; R600-NEXT: SETE_INT T0.X, PV.W, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp ueq float %a, %b %1 = sext i1 %0 to i32 @@ -147,6 +419,31 @@ entry: ; GCN: v_cmp_nle_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc define amdgpu_kernel void @f32_ugt(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_ugt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_nle_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_ugt: +; R600: ; %bb.0: ; %entry 
+; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGE * T0.W, KC0[2].W, KC0[2].Z, +; R600-NEXT: SETE_DX10 T0.X, PV.W, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp ugt float %a, %b %1 = sext i1 %0 to i32 @@ -161,6 +458,31 @@ entry: ; GCN: v_cmp_nlt_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc define amdgpu_kernel void @f32_uge(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_uge: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_nlt_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_uge: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGT * T0.W, KC0[2].W, KC0[2].Z, +; R600-NEXT: SETE_DX10 T0.X, PV.W, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp uge float %a, %b %1 = sext i1 %0 to i32 @@ -175,6 +497,31 @@ entry: ; GCN: v_cmp_nge_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc define amdgpu_kernel void @f32_ult(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_ult: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_nge_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_ult: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETGE * T0.W, KC0[2].Z, KC0[2].W, +; R600-NEXT: SETE_DX10 T0.X, PV.W, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp ult float %a, %b %1 = sext i1 %0 to i32 @@ -189,6 +536,31 @@ entry: ; GCN: v_cmp_ngt_f32_e32 vcc ; GCN-NEXT: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, vcc define amdgpu_kernel void @f32_ule(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_ule: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_ngt_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_ule: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; 
R600-NEXT: SETGT * T0.W, KC0[2].Z, KC0[2].W, +; R600-NEXT: SETE_DX10 T0.X, PV.W, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp ule float %a, %b %1 = sext i1 %0 to i32 @@ -200,6 +572,30 @@ entry: ; R600: SETNE_DX10 ; GCN: v_cmp_neq_f32 define amdgpu_kernel void @f32_une(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_une: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_neq_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_une: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETNE_DX10 * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp une float %a, %b %1 = sext i1 %0 to i32 @@ -214,6 +610,33 @@ entry: ; R600: SETNE_INT ; GCN: v_cmp_u_f32 define amdgpu_kernel void @f32_uno(ptr addrspace(1) %out, float %a, float %b) #0 { +; GCN-LABEL: f32_uno: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_cmp_u_f32_e32 vcc, s2, v0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: f32_uno: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: SETNE_DX10 T0.W, KC0[2].Z, KC0[2].Z, +; R600-NEXT: SETNE_DX10 * T1.W, KC0[2].W, KC0[2].W, +; R600-NEXT: OR_INT * T0.W, PV.W, PS, +; R600-NEXT: SETNE_INT T0.X, PV.W, 0.0, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = fcmp uno float %a, %b %1 = sext i1 %0 to i32 @@ -229,6 +652,30 @@ entry: ; R600: SETE_INT ; GCN: s_cmp_eq_u32 define amdgpu_kernel void @i32_eq(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_eq: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_eq_u32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_eq: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETE_INT * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp eq i32 %a, %b %1 = sext i1 %0 to i32 @@ -240,6 +687,30 @@ entry: ; R600: SETNE_INT ; GCN: s_cmp_lg_u32 define 
amdgpu_kernel void @i32_ne(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_ne: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_lg_u32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_ne: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETNE_INT * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp ne i32 %a, %b %1 = sext i1 %0 to i32 @@ -251,6 +722,30 @@ entry: ; R600: SETGT_UINT ; GCN: s_cmp_gt_u32 define amdgpu_kernel void @i32_ugt(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_ugt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_gt_u32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_ugt: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGT_UINT * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp ugt i32 %a, %b %1 = sext i1 %0 to i32 @@ -262,6 +757,30 @@ entry: ; R600: SETGE_UINT ; GCN: s_cmp_ge_u32 define amdgpu_kernel void @i32_uge(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_uge: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_ge_u32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_uge: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGE_UINT * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp uge i32 %a, %b %1 = sext i1 %0 to i32 @@ -273,6 +792,30 @@ entry: ; R600: SETGT_UINT ; GCN: s_cmp_lt_u32 define amdgpu_kernel void @i32_ult(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_ult: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_lt_u32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; 
GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_ult: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGT_UINT * T1.X, KC0[2].W, KC0[2].Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp ult i32 %a, %b %1 = sext i1 %0 to i32 @@ -284,6 +827,30 @@ entry: ; R600: SETGE_UINT ; GCN: s_cmp_le_u32 define amdgpu_kernel void @i32_ule(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_ule: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_le_u32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_ule: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGE_UINT * T1.X, KC0[2].W, KC0[2].Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp ule i32 %a, %b %1 = sext i1 %0 to i32 @@ -295,6 +862,30 @@ entry: ; R600: SETGT_INT ; GCN: s_cmp_gt_i32 define amdgpu_kernel void @i32_sgt(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_sgt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_gt_i32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_sgt: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGT_INT * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp sgt i32 %a, %b %1 = sext i1 %0 to i32 @@ -306,6 +897,30 @@ entry: ; R600: SETGE_INT ; GCN: s_cmp_ge_i32 define amdgpu_kernel void @i32_sge(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_sge: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_ge_i32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_sge: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGE_INT * T1.X, KC0[2].Z, KC0[2].W, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: 
%0 = icmp sge i32 %a, %b %1 = sext i1 %0 to i32 @@ -317,6 +932,30 @@ entry: ; R600: SETGT_INT ; GCN: s_cmp_lt_i32 define amdgpu_kernel void @i32_slt(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_slt: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_lt_i32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_slt: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGT_INT * T1.X, KC0[2].W, KC0[2].Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp slt i32 %a, %b %1 = sext i1 %0 to i32 @@ -328,6 +967,30 @@ entry: ; R600: SETGE_INT ; GCN: s_cmp_le_i32 define amdgpu_kernel void @i32_sle(ptr addrspace(1) %out, i32 %a, i32 %b) #0 { +; GCN-LABEL: i32_sle: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9 +; GCN-NEXT: s_mov_b32 s7, 0xf000 +; GCN-NEXT: s_mov_b32 s6, -1 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b32 s4, s0 +; GCN-NEXT: s_mov_b32 s5, s1 +; GCN-NEXT: s_cmp_le_i32 s2, s3 +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; GCN-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: i32_sle: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; R600-NEXT: SETGE_INT * T1.X, KC0[2].W, KC0[2].Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) entry: %0 = icmp sle i32 %a, %b %1 = sext i1 %0 to i32 @@ -345,6 +1008,59 @@ entry: ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, ; GCN: s_endpgm define amdgpu_kernel void @v3i32_eq(ptr addrspace(1) %out, ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) #0 { +; GCN-LABEL: v3i32_eq: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v7, 4, v0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 +; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: buffer_load_dwordx4 v[0:3], v[7:8], s[8:11], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: buffer_load_dwordx4 v[3:6], v[7:8], s[0:3], 0 addr64 +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v1, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v5 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc +; GCN-NEXT: buffer_store_dword v2, v[7:8], s[4:7], 0 addr64 offset:8 +; GCN-NEXT: buffer_store_dwordx2 v[0:1], v[7:8], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: v3i32_eq: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 9, @14, KC0[CB0:0-32], KC1[] +; R600-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1 +; R600-NEXT: VTX_READ_128 T2.XYZW, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: LSHL * T0.W, T0.X, literal.x, +; R600-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; R600-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, +; R600-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, +; R600-NEXT: ALU clause starting at 14: +; R600-NEXT: SETE_INT * T0.Y, T2.Y, T1.Y, +; R600-NEXT: SETE_INT T0.X, T2.X, T1.X, +; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; R600-NEXT: LSHR T1.X, PV.W, literal.x, +; R600-NEXT: SETE_INT * T2.X, T2.Z, T1.Z, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: ADD_INT * T0.W, T0.W, literal.x, +; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00) +; R600-NEXT: LSHR * T3.X, PV.W, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.a = getelementptr <3 x i32>, ptr addrspace(1) %ptra, i32 %tid %gep.b = getelementptr <3 x i32>, ptr addrspace(1) %ptrb, i32 %tid @@ -366,6 +1082,100 @@ define amdgpu_kernel void @v3i32_eq(ptr addrspace(1) %out, ptr addrspace(1) %ptr ; GCN-DAG: v_cndmask_b32_e64 {{v[0-9]+}}, 0, -1, ; GCN: s_endpgm define amdgpu_kernel void @v3i8_eq(ptr addrspace(1) %out, ptr addrspace(1) %ptra, ptr addrspace(1) %ptrb) #0 { +; GCN-LABEL: v3i8_eq: +; GCN: ; %bb.0: +; GCN-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x9 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xd +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_mov_b32 s10, 0 +; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GCN-NEXT: v_mov_b32_e32 v1, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_mov_b64 s[8:9], s[6:7] +; GCN-NEXT: buffer_load_dword v2, v[0:1], s[8:11], 0 addr64 +; GCN-NEXT: v_mov_b32_e32 v3, 0xff +; GCN-NEXT: s_mov_b64 s[2:3], s[10:11] +; GCN-NEXT: s_mov_b64 s[6:7], s[10:11] +; GCN-NEXT: buffer_load_dword v4, v[0:1], s[0:3], 0 addr64 +; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: v_and_b32_e32 v5, 0xff, v2 +; GCN-NEXT: v_bfe_u32 v6, v2, 8, 8 +; GCN-NEXT: v_bfe_u32 v2, v2, 16, 8 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_and_b32_e32 v7, 0xff, v4 +; GCN-NEXT: v_bfe_u32 v8, v4, 8, 8 +; GCN-NEXT: v_bfe_u32 v4, v4, 16, 8 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v5, v7 +; GCN-NEXT: v_cndmask_b32_e32 v5, 0, v3, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v6, v8 +; GCN-NEXT: v_cndmask_b32_e64 v6, 0, -1, vcc +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, v2, v4 +; GCN-NEXT: v_cndmask_b32_e32 v2, 0, v3, vcc +; GCN-NEXT: v_lshlrev_b32_e32 v3, 8, v6 +; GCN-NEXT: v_or_b32_e32 v3, v5, v3 +; GCN-NEXT: buffer_store_byte v2, v[0:1], s[4:7], 0 addr64 offset:2 +; GCN-NEXT: buffer_store_short v3, v[0:1], s[4:7], 0 addr64 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: v3i8_eq: +; R600: ; %bb.0: +; R600-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @6 +; R600-NEXT: ALU 41, @14, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT MSKOR T8.XW, T9.X +; R600-NEXT: MEM_RAT MSKOR T7.XW, T0.X +; R600-NEXT: CF_END +; R600-NEXT: Fetch clause starting at 6: +; R600-NEXT: VTX_READ_32 T7.X, T7.X, 0, #1 +; R600-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; R600-NEXT: ALU clause starting at 10: +; R600-NEXT: LSHL * T0.W, T0.X, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: ADD_INT T0.X, KC0[2].W, PV.W, +; R600-NEXT: ADD_INT * T7.X, KC0[2].Z, PV.W, +; R600-NEXT: ALU clause starting at 14: +; R600-NEXT: MOV * T1.W, literal.x, +; R600-NEXT: 8(1.121039e-44), 0(0.000000e+00) 
+; R600-NEXT: AND_INT T0.Y, T0.X, literal.x, +; R600-NEXT: BFE_UINT T0.Z, T0.X, literal.y, PV.W, +; R600-NEXT: BFE_UINT T2.W, T7.X, literal.y, PV.W, BS:VEC_120/SCL_212 +; R600-NEXT: AND_INT * T3.W, T7.X, literal.x, +; R600-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; R600-NEXT: SETE_INT T2.W, PV.W, PV.Z, +; R600-NEXT: SETE_INT * T3.W, PS, PV.Y, +; R600-NEXT: AND_INT T0.Z, PS, literal.x, +; R600-NEXT: LSHL T2.W, PV.W, literal.y, +; R600-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; R600-NEXT: 255(3.573311e-43), 8(1.121039e-44) +; R600-NEXT: AND_INT T3.W, PS, literal.x, +; R600-NEXT: OR_INT * T2.W, PV.Z, PV.W, +; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; R600-NEXT: AND_INT T2.W, PS, literal.x, +; R600-NEXT: LSHL * T3.W, PV.W, literal.y, +; R600-NEXT: 65535(9.183409e-41), 3(4.203895e-45) +; R600-NEXT: LSHL T8.X, PV.W, PS, +; R600-NEXT: LSHL * T8.W, literal.x, PS, +; R600-NEXT: 65535(9.183409e-41), 0(0.000000e+00) +; R600-NEXT: MOV T8.Y, 0.0, +; R600-NEXT: BFE_UINT T0.Z, T0.X, literal.x, T1.W, BS:VEC_021/SCL_122 +; R600-NEXT: BFE_UINT T1.W, T7.X, literal.x, T1.W, BS:VEC_120/SCL_212 +; R600-NEXT: ADD_INT * T2.W, T0.W, literal.y, +; R600-NEXT: 16(2.242078e-44), 2(2.802597e-45) +; R600-NEXT: AND_INT T3.W, PS, literal.x, +; R600-NEXT: SETE_INT * T1.W, PV.W, PV.Z, +; R600-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; R600-NEXT: AND_INT T1.W, PS, literal.x, +; R600-NEXT: LSHL * T3.W, PV.W, literal.y, +; R600-NEXT: 255(3.573311e-43), 3(4.203895e-45) +; R600-NEXT: LSHL T7.X, PV.W, PS, +; R600-NEXT: LSHL * T7.W, literal.x, PS, +; R600-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; R600-NEXT: MOV T7.Y, 0.0, +; R600-NEXT: MOV T8.Z, 0.0, +; R600-NEXT: MOV * T7.Z, 0.0, +; R600-NEXT: LSHR T0.X, T2.W, literal.x, +; R600-NEXT: LSHR * T9.X, T0.W, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %gep.a = getelementptr <3 x i8>, ptr addrspace(1) %ptra, i32 %tid %gep.b = getelementptr <3 x i8>, ptr addrspace(1) %ptrb, i32 %tid @@ -415,6 +1225,73 @@ bb2: ; GCN: s_cmp_gt_i32 ; GCN: s_cmp_gt_i32 define amdgpu_kernel void @setcc_v2i32_expand( +; GCN-LABEL: setcc_v2i32_expand: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx8 s[0:7], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s11, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s10, -1 +; GCN-NEXT: s_mov_b32 s8, s6 +; GCN-NEXT: s_mov_b32 s9, s7 +; GCN-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 +; GCN-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_gt_i32 s1, 1 +; GCN-NEXT: s_cselect_b64 s[6:7], -1, 0 +; GCN-NEXT: s_cmp_gt_i32 s0, 1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] +; GCN-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GCN-NEXT: s_cmp_lt_i32 s5, 0x47800001 +; GCN-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GCN-NEXT: v_xor_b32_e32 v0, s3, v0 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: s_cmp_lt_i32 s4, 0x47800001 +; GCN-NEXT: v_xor_b32_e32 v2, s2, v1 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v0, vcc +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v2, vcc +; GCN-NEXT: buffer_store_dwordx2 v[0:1], off, s[8:11], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: setcc_v2i32_expand: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @8 +; R600-NEXT: ALU 1, @15, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @10 +; R600-NEXT: ALU 14, @17, KC0[CB0:0-32], KC1[] +; 
R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 8: +; R600-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; R600-NEXT: Fetch clause starting at 10: +; R600-NEXT: VTX_READ_64 T2.XY, T2.X, 0, #1 +; R600-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 +; R600-NEXT: ALU clause starting at 14: +; R600-NEXT: MOV * T0.X, KC0[2].Y, +; R600-NEXT: ALU clause starting at 15: +; R600-NEXT: MOV T1.X, KC0[2].Z, +; R600-NEXT: MOV * T2.X, KC0[2].W, +; R600-NEXT: ALU clause starting at 17: +; R600-NEXT: SETGT_INT * T0.W, T0.Y, 1, +; R600-NEXT: CNDE_INT T0.W, PV.W, 0.0, literal.x, +; R600-NEXT: SETGT_INT * T1.W, T0.X, 1, +; R600-NEXT: -2147483648(-0.000000e+00), 0(0.000000e+00) +; R600-NEXT: CNDE_INT T0.Z, PS, 0.0, literal.x, +; R600-NEXT: XOR_INT T0.W, PV.W, T1.Y, +; R600-NEXT: SETGT_INT * T1.W, T2.Y, literal.y, +; R600-NEXT: -2147483648(-0.000000e+00), 1199570944(6.553600e+04) +; R600-NEXT: CNDE_INT T0.Y, PS, PV.W, literal.x, +; R600-NEXT: SETGT_INT T0.W, T2.X, literal.y, +; R600-NEXT: XOR_INT * T1.W, PV.Z, T1.X, +; R600-NEXT: 1065353216(1.000000e+00), 1199570944(6.553600e+04) +; R600-NEXT: CNDE_INT T0.X, PV.W, PS, literal.x, +; R600-NEXT: LSHR * T1.X, KC0[3].X, literal.y, +; R600-NEXT: 1065353216(1.000000e+00), 2(2.802597e-45) ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, @@ -442,6 +1319,103 @@ entry: ; GCN: s_cmp_gt_i32 ; GCN: s_cmp_gt_i32 define amdgpu_kernel void @setcc_v4i32_expand( +; GCN-LABEL: setcc_v4i32_expand: +; GCN: ; %bb.0: ; %entry +; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x9 +; GCN-NEXT: s_mov_b32 s3, 0xf000 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_load_dwordx4 s[12:15], s[4:5], 0x0 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: s_mov_b32 s0, s10 +; GCN-NEXT: s_mov_b32 s1, s11 +; GCN-NEXT: s_load_dwordx4 s[4:7], s[6:7], 0x0 +; GCN-NEXT: s_load_dwordx4 s[8:11], s[8:9], 0x0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_cmp_gt_i32 s15, 1 +; GCN-NEXT: s_cselect_b64 s[16:17], -1, 0 +; GCN-NEXT: s_cmp_gt_i32 s14, 1 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[16:17] +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_gt_i32 s13, 1 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[14:15] +; GCN-NEXT: v_lshlrev_b32_e32 v0, 31, v0 +; GCN-NEXT: s_cselect_b64 s[14:15], -1, 0 +; GCN-NEXT: s_cmp_gt_i32 s12, 1 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[14:15] +; GCN-NEXT: v_lshlrev_b32_e32 v1, 31, v1 +; GCN-NEXT: v_xor_b32_e32 v0, s7, v0 +; GCN-NEXT: s_cselect_b64 s[12:13], -1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[12:13] +; GCN-NEXT: v_lshlrev_b32_e32 v2, 31, v2 +; GCN-NEXT: v_xor_b32_e32 v1, s6, v1 +; GCN-NEXT: s_cmp_lt_i32 s11, 0x47800001 +; GCN-NEXT: v_lshlrev_b32_e32 v4, 31, v3 +; GCN-NEXT: v_xor_b32_e32 v5, s5, v2 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v3, 1.0, v0, vcc +; GCN-NEXT: s_cmp_lt_i32 s10, 0x47800001 +; GCN-NEXT: v_xor_b32_e32 v0, s4, v4 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v2, 1.0, v1, vcc +; GCN-NEXT: s_cmp_lt_i32 s9, 0x47800001 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v1, 1.0, v5, vcc +; GCN-NEXT: s_cmp_lt_i32 s8, 0x47800001 +; GCN-NEXT: s_cselect_b64 vcc, -1, 0 +; GCN-NEXT: v_cndmask_b32_e32 v0, 1.0, v0, vcc +; GCN-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; GCN-NEXT: s_endpgm +; +; R600-LABEL: setcc_v4i32_expand: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 0, @14, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 0 @8 +; R600-NEXT: ALU 1, @15, KC0[CB0:0-32], KC1[] +; R600-NEXT: TEX 1 @10 +; 
R600-NEXT: ALU 28, @17, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.XYZW, T0.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: PAD +; R600-NEXT: Fetch clause starting at 8: +; R600-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; R600-NEXT: Fetch clause starting at 10: +; R600-NEXT: VTX_READ_128 T2.XYZW, T2.X, 0, #1 +; R600-NEXT: VTX_READ_128 T1.XYZW, T1.X, 0, #1 +; R600-NEXT: ALU clause starting at 14: +; R600-NEXT: MOV * T0.X, KC0[2].Y, +; R600-NEXT: ALU clause starting at 15: +; R600-NEXT: MOV T1.X, KC0[2].Z, +; R600-NEXT: MOV * T2.X, KC0[2].W, +; R600-NEXT: ALU clause starting at 17: +; R600-NEXT: SETGT_INT T0.W, T0.W, 1, +; R600-NEXT: SETGT_INT * T3.W, T0.Z, 1, +; R600-NEXT: CNDE_INT * T0.W, PV.W, 0.0, literal.x, +; R600-NEXT: -2147483648(-0.000000e+00), 0(0.000000e+00) +; R600-NEXT: XOR_INT T3.Y, PV.W, T1.W, +; R600-NEXT: SETGT_INT T0.Z, T2.W, literal.x, +; R600-NEXT: CNDE_INT T0.W, T3.W, 0.0, literal.y, BS:VEC_201 +; R600-NEXT: SETGT_INT * T1.W, T0.Y, 1, +; R600-NEXT: 1199570944(6.553600e+04), -2147483648(-0.000000e+00) +; R600-NEXT: CNDE_INT T3.X, PS, 0.0, literal.x, +; R600-NEXT: SETGT_INT T0.Y, T2.Z, literal.y, +; R600-NEXT: XOR_INT T1.Z, PV.W, T1.Z, +; R600-NEXT: SETGT_INT T0.W, T0.X, 1, +; R600-NEXT: CNDE_INT * T1.W, PV.Z, PV.Y, literal.z, +; R600-NEXT: -2147483648(-0.000000e+00), 1199570944(6.553600e+04) +; R600-NEXT: 1065353216(1.000000e+00), 0(0.000000e+00) +; R600-NEXT: CNDE_INT T3.Y, PV.W, 0.0, literal.x, +; R600-NEXT: CNDE_INT T1.Z, PV.Y, PV.Z, literal.y, +; R600-NEXT: SETGT_INT T0.W, T2.Y, literal.z, +; R600-NEXT: XOR_INT * T2.W, PV.X, T1.Y, +; R600-NEXT: -2147483648(-0.000000e+00), 1065353216(1.000000e+00) +; R600-NEXT: 1199570944(6.553600e+04), 0(0.000000e+00) +; R600-NEXT: CNDE_INT T1.Y, PV.W, PS, literal.x, +; R600-NEXT: SETGT_INT T0.W, T2.X, literal.y, +; R600-NEXT: XOR_INT * T2.W, PV.Y, T1.X, +; R600-NEXT: 1065353216(1.000000e+00), 1199570944(6.553600e+04) +; R600-NEXT: CNDE_INT T1.X, PV.W, PS, literal.x, +; R600-NEXT: LSHR * T0.X, KC0[3].X, literal.y, +; R600-NEXT: 1065353216(1.000000e+00), 2(2.802597e-45) ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, @@ -464,3 +1438,5 @@ entry: } attributes #0 = { nounwind "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; FUNC: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll index 65a17ed67481c..499d38855b7c6 100644 --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -294,15 +294,19 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshr_b64 v[16:17], v[0:1], v16 ; GCN-NEXT: v_lshl_b64 v[18:19], v[2:3], v8 -; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v9, v16, v9 +; GCN-NEXT: v_subrev_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v9 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_and_b32_e32 v9, 1, v9 +; GCN-NEXT: v_lshl_b64 v[16:17], v[0:1], v16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] ; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc ; GCN-NEXT: v_cndmask_b32_e64 v2, v9, v2, s[4:5] @@ -312,14 +316,18 @@ define <2 x i128> @v_shl_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshl_b64 v[16:17], v[6:7], v12 ; GCN-NEXT: v_cndmask_b32_e64 v3, v11, v3, s[4:5] ; GCN-NEXT: v_or_b32_e32 v16, v16, v9 +; GCN-NEXT: v_subrev_i32_e64 v9, s[4:5], 64, v12 ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13] -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10 +; GCN-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] ; GCN-NEXT: v_lshl_b64 v[9:10], v[4:5], v9 +; GCN-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GCN-NEXT: v_and_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5] ; GCN-NEXT: v_lshl_b64 v[0:1], v[0:1], v8 @@ -343,15 +351,19 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 -; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v9, v16, v9 +; GCN-NEXT: v_subrev_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v9 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_and_b32_e32 v9, 1, v9 +; GCN-NEXT: v_lshr_b64 v[16:17], v[2:3], v16 +; GCN-NEXT: 
v_cmp_eq_u32_e32 vcc, 1, v9 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] ; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] @@ -361,14 +373,18 @@ define <2 x i128> @v_lshr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 ; GCN-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5] ; GCN-NEXT: v_or_b32_e32 v16, v16, v9 +; GCN-NEXT: v_subrev_i32_e64 v9, s[4:5], 64, v12 ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13] -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10 +; GCN-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] ; GCN-NEXT: v_lshr_b64 v[9:10], v[6:7], v9 +; GCN-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GCN-NEXT: v_and_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5] ; GCN-NEXT: v_lshr_b64 v[2:3], v[2:3], v8 @@ -392,15 +408,19 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_sub_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_lshl_b64 v[16:17], v[2:3], v16 ; GCN-NEXT: v_lshr_b64 v[18:19], v[0:1], v8 -; GCN-NEXT: v_cmp_gt_u64_e32 vcc, 64, v[8:9] -; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] +; GCN-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11] +; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[8:9] +; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v11, v9, v11 -; GCN-NEXT: v_subrev_i32_e64 v9, s[6:7], 64, v8 +; GCN-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5] +; GCN-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v9, v16, v9 +; GCN-NEXT: v_subrev_i32_e32 v16, vcc, 64, v8 ; GCN-NEXT: v_or_b32_e32 v19, v19, v17 -; GCN-NEXT: v_or_b32_e32 v18, v18, v16 ; GCN-NEXT: v_or_b32_e32 v10, v8, v10 -; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v9 -; GCN-NEXT: s_and_b64 vcc, s[4:5], vcc +; GCN-NEXT: v_and_b32_e32 v9, 1, v9 +; GCN-NEXT: v_ashr_i64 v[16:17], v[2:3], v16 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 ; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11] ; GCN-NEXT: v_cndmask_b32_e32 v9, v16, v18, vcc ; GCN-NEXT: v_cndmask_b32_e64 v0, v9, v0, s[4:5] @@ -410,14 +430,18 @@ define <2 x i128> @v_ashr_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_lshr_b64 v[16:17], v[4:5], v12 ; GCN-NEXT: v_cndmask_b32_e64 v1, v11, v1, s[4:5] ; GCN-NEXT: v_or_b32_e32 v16, v16, v9 +; GCN-NEXT: v_subrev_i32_e64 v9, s[4:5], 64, v12 ; GCN-NEXT: v_cmp_gt_u64_e64 s[4:5], 64, v[12:13] -; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] -; GCN-NEXT: v_subrev_i32_e64 v9, s[8:9], 64, v12 ; GCN-NEXT: v_or_b32_e32 v11, v17, v10 +; GCN-NEXT: v_cndmask_b32_e64 v17, 0, 1, s[4:5] +; GCN-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15] ; GCN-NEXT: v_ashr_i64 v[9:10], v[6:7], v9 +; GCN-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5] +; GCN-NEXT: v_and_b32_e32 v17, v18, v17 +; GCN-NEXT: v_and_b32_e32 v17, 1, v17 ; GCN-NEXT: v_or_b32_e32 v15, v13, v15 ; GCN-NEXT: v_or_b32_e32 v14, v12, v14 -; GCN-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5] +; GCN-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v17 ; GCN-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15] ; GCN-NEXT: v_cndmask_b32_e64 v9, v9, v16, s[4:5] ; GCN-NEXT: v_cndmask_b32_e64 v4, v9, v4, s[6:7] @@ -448,62 +472,66 @@ define amdgpu_kernel void @s_shl_v2i128ss(<2 x i128> %lhs, 
<2 x i128> %rhs) { ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-NEXT: s_sub_i32 s22, 64, s8 -; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshr_b64 s[22:23], s[0:1], s22 -; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: s_sub_i32 s20, 64, s12 +; GCN-NEXT: s_lshl_b64 s[16:17], s[6:7], s12 +; GCN-NEXT: s_lshr_b64 s[20:21], s[4:5], s20 +; GCN-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] +; GCN-NEXT: v_cmp_lt_u64_e64 s[20:21], s[12:13], 64 +; GCN-NEXT: s_sub_i32 s18, s12, 64 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[20:21] +; GCN-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 0 +; GCN-NEXT: s_lshl_b64 s[18:19], s[4:5], s18 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[20:21] +; GCN-NEXT: v_readfirstlane_b32 s20, v1 +; GCN-NEXT: v_readfirstlane_b32 s21, v0 +; GCN-NEXT: s_and_b32 s20, s20, s21 +; GCN-NEXT: s_and_b32 s20, 1, s20 +; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], s12 +; GCN-NEXT: s_cmp_eq_u32 s20, 1 +; GCN-NEXT: s_cselect_b32 s20, s17, s19 +; GCN-NEXT: s_cselect_b32 s21, s16, s18 +; GCN-NEXT: s_cselect_b32 s22, s5, 0 +; GCN-NEXT: s_cselect_b32 s23, s4, 0 +; GCN-NEXT: s_sub_i32 s16, 64, s8 +; GCN-NEXT: s_lshr_b64 s[16:17], s[0:1], s16 ; GCN-NEXT: s_lshl_b64 s[18:19], s[2:3], s8 -; GCN-NEXT: s_lshl_b64 s[20:21], s[0:1], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s19, s19, s21 -; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 -; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec -; GCN-NEXT: s_cselect_b32 s9, s3, s19 -; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s3, s18, s20 -; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec -; GCN-NEXT: v_cmp_lt_u64_e64 s[10:11], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 -; GCN-NEXT: s_cselect_b32 s22, s2, s3 -; GCN-NEXT: s_and_b64 s[2:3], s[18:19], s[10:11] -; GCN-NEXT: s_sub_i32 s18, 64, s12 -; GCN-NEXT: s_sub_i32 s10, s12, 64 -; GCN-NEXT: s_lshr_b64 s[18:19], s[4:5], s18 -; GCN-NEXT: s_lshl_b64 s[20:21], s[6:7], s12 -; GCN-NEXT: s_lshl_b64 s[10:11], s[4:5], s10 -; GCN-NEXT: s_or_b64 s[18:19], s[20:21], s[18:19] -; GCN-NEXT: s_and_b64 s[20:21], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s11, s19, s11 -; GCN-NEXT: s_or_b64 s[14:15], s[12:13], s[14:15] -; GCN-NEXT: v_cmp_eq_u64_e64 s[14:15], s[14:15], 0 -; GCN-NEXT: s_and_b64 s[20:21], s[14:15], exec -; GCN-NEXT: s_cselect_b32 s13, s7, s11 -; GCN-NEXT: s_and_b64 s[20:21], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s7, s18, s10 -; GCN-NEXT: s_and_b64 s[10:11], s[14:15], exec -; GCN-NEXT: s_cselect_b32 s10, s6, s7 +; GCN-NEXT: s_or_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: v_cmp_lt_u64_e64 s[18:19], s[8:9], 64 +; GCN-NEXT: s_sub_i32 s4, s8, 64 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[18:19] +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 +; GCN-NEXT: s_lshl_b64 s[4:5], s[0:1], s4 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[18:19] +; GCN-NEXT: v_readfirstlane_b32 s18, v1 +; GCN-NEXT: v_readfirstlane_b32 s19, v0 +; GCN-NEXT: s_and_b32 s18, s18, s19 +; GCN-NEXT: s_and_b32 s18, 1, s18 ; GCN-NEXT: s_lshl_b64 s[0:1], s[0:1], s8 -; GCN-NEXT: s_and_b64 s[6:7], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s6, s1, 0 -; GCN-NEXT: s_cselect_b32 s7, s0, 0 -; GCN-NEXT: s_lshl_b64 s[0:1], s[4:5], s12 -; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec -; GCN-NEXT: s_cselect_b32 s1, s1, 0 
-; GCN-NEXT: s_cselect_b32 s0, s0, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v2, s10 -; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: s_cmp_eq_u32 s18, 1 +; GCN-NEXT: s_cselect_b32 s4, s16, s4 +; GCN-NEXT: s_cselect_b32 s5, s17, s5 +; GCN-NEXT: s_cselect_b32 s16, s1, 0 +; GCN-NEXT: s_cselect_b32 s17, s0, 0 +; GCN-NEXT: s_or_b64 s[0:1], s[8:9], s[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_mov_b32_e32 v0, s23 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s3, s3, s5 +; GCN-NEXT: s_cselect_b32 s2, s2, s4 +; GCN-NEXT: s_or_b64 s[0:1], s[12:13], s[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_mov_b32_e32 v1, s22 +; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GCN-NEXT: s_cselect_b32 s0, s6, s21 +; GCN-NEXT: s_cselect_b32 s1, s7, s20 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: v_mov_b32_e32 v3, s1 ; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s7 -; GCN-NEXT: v_mov_b32_e32 v1, s6 -; GCN-NEXT: v_mov_b32_e32 v2, s22 -; GCN-NEXT: v_mov_b32_e32 v3, s9 +; GCN-NEXT: v_mov_b32_e32 v0, s17 +; GCN-NEXT: v_mov_b32_e32 v1, s16 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = shl <2 x i128> %lhs, %rhs @@ -523,62 +551,66 @@ define amdgpu_kernel void @s_lshr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-NEXT: s_sub_i32 s22, 64, s8 -; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 -; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: s_sub_i32 s20, 64, s12 +; GCN-NEXT: s_lshr_b64 s[16:17], s[4:5], s12 +; GCN-NEXT: s_lshl_b64 s[20:21], s[6:7], s20 +; GCN-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] +; GCN-NEXT: v_cmp_lt_u64_e64 s[20:21], s[12:13], 64 +; GCN-NEXT: s_sub_i32 s18, s12, 64 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[20:21] +; GCN-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 0 +; GCN-NEXT: s_lshr_b64 s[18:19], s[6:7], s18 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[20:21] +; GCN-NEXT: v_readfirstlane_b32 s20, v1 +; GCN-NEXT: v_readfirstlane_b32 s21, v0 +; GCN-NEXT: s_and_b32 s20, s20, s21 +; GCN-NEXT: s_and_b32 s20, 1, s20 +; GCN-NEXT: s_lshr_b64 s[6:7], s[6:7], s12 +; GCN-NEXT: s_cmp_eq_u32 s20, 1 +; GCN-NEXT: s_cselect_b32 s20, s17, s19 +; GCN-NEXT: s_cselect_b32 s21, s16, s18 +; GCN-NEXT: s_cselect_b32 s22, s7, 0 +; GCN-NEXT: s_cselect_b32 s23, s6, 0 +; GCN-NEXT: s_sub_i32 s16, 64, s8 +; GCN-NEXT: s_lshl_b64 s[16:17], s[2:3], s16 ; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s8 -; GCN-NEXT: s_lshr_b64 s[20:21], s[2:3], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s19, s19, s21 -; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 -; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec -; GCN-NEXT: s_cselect_b32 s9, s1, s19 -; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s1, s18, s20 -; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec -; GCN-NEXT: v_cmp_lt_u64_e64 s[10:11], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 -; GCN-NEXT: s_cselect_b32 s22, s0, s1 -; GCN-NEXT: s_and_b64 s[0:1], s[18:19], s[10:11] -; GCN-NEXT: s_sub_i32 
s18, 64, s12 -; GCN-NEXT: s_sub_i32 s10, s12, 64 -; GCN-NEXT: s_lshl_b64 s[18:19], s[6:7], s18 -; GCN-NEXT: s_lshr_b64 s[20:21], s[4:5], s12 -; GCN-NEXT: s_lshr_b64 s[10:11], s[6:7], s10 -; GCN-NEXT: s_or_b64 s[18:19], s[20:21], s[18:19] -; GCN-NEXT: s_and_b64 s[20:21], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s11, s19, s11 -; GCN-NEXT: s_or_b64 s[14:15], s[12:13], s[14:15] -; GCN-NEXT: v_cmp_eq_u64_e64 s[14:15], s[14:15], 0 -; GCN-NEXT: s_and_b64 s[20:21], s[14:15], exec -; GCN-NEXT: s_cselect_b32 s13, s5, s11 -; GCN-NEXT: s_and_b64 s[20:21], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s5, s18, s10 -; GCN-NEXT: s_and_b64 s[10:11], s[14:15], exec -; GCN-NEXT: s_cselect_b32 s10, s4, s5 +; GCN-NEXT: s_or_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: v_cmp_lt_u64_e64 s[18:19], s[8:9], 64 +; GCN-NEXT: s_sub_i32 s6, s8, 64 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[18:19] +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 +; GCN-NEXT: s_lshr_b64 s[6:7], s[2:3], s6 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[18:19] +; GCN-NEXT: v_readfirstlane_b32 s18, v1 +; GCN-NEXT: v_readfirstlane_b32 s19, v0 +; GCN-NEXT: s_and_b32 s18, s18, s19 +; GCN-NEXT: s_and_b32 s18, 1, s18 ; GCN-NEXT: s_lshr_b64 s[2:3], s[2:3], s8 -; GCN-NEXT: s_and_b64 s[4:5], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s4, s3, 0 -; GCN-NEXT: s_cselect_b32 s5, s2, 0 -; GCN-NEXT: s_lshr_b64 s[2:3], s[6:7], s12 +; GCN-NEXT: s_cmp_eq_u32 s18, 1 +; GCN-NEXT: s_cselect_b32 s6, s16, s6 +; GCN-NEXT: s_cselect_b32 s7, s17, s7 +; GCN-NEXT: s_cselect_b32 s16, s3, 0 +; GCN-NEXT: s_cselect_b32 s17, s2, 0 +; GCN-NEXT: s_or_b64 s[2:3], s[8:9], s[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 +; GCN-NEXT: v_mov_b32_e32 v2, s23 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s2, s1, s7 +; GCN-NEXT: s_cselect_b32 s3, s0, s6 +; GCN-NEXT: s_or_b64 s[0:1], s[12:13], s[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_mov_b32_e32 v3, s22 ; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s3, 0 -; GCN-NEXT: s_cselect_b32 s1, s2, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: s_cselect_b32 s0, s4, s21 +; GCN-NEXT: s_cselect_b32 s1, s5, s20 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_mov_b32_e32 v3, s16 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = lshr <2 x i128> %lhs, %rhs @@ -598,64 +630,68 @@ define amdgpu_kernel void @s_ashr_v2i128_ss(<2 x i128> %lhs, <2 x i128> %rhs) { ; GCN-NEXT: v_mov_b32_e32 v7, 0 ; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_cmp_lt_u64_e64 s[16:17], s[8:9], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-NEXT: s_sub_i32 s22, 64, s8 -; GCN-NEXT: s_sub_i32 s20, s8, 64 -; GCN-NEXT: s_lshl_b64 s[22:23], s[2:3], s22 -; GCN-NEXT: s_and_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: s_sub_i32 s20, 64, s12 +; GCN-NEXT: s_lshr_b64 s[16:17], s[4:5], s12 +; GCN-NEXT: s_lshl_b64 s[20:21], s[6:7], s20 +; GCN-NEXT: s_or_b64 s[16:17], s[16:17], s[20:21] +; GCN-NEXT: v_cmp_lt_u64_e64 s[20:21], s[12:13], 64 +; GCN-NEXT: s_sub_i32 s18, s12, 64 +; GCN-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[20:21] +; GCN-NEXT: v_cmp_eq_u64_e64 s[20:21], s[14:15], 0 +; GCN-NEXT: s_ashr_i64 s[18:19], s[6:7], s18 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[20:21] +; GCN-NEXT: v_readfirstlane_b32 s20, v1 +; GCN-NEXT: v_readfirstlane_b32 s21, v0 +; GCN-NEXT: s_and_b32 s20, s20, s21 +; GCN-NEXT: s_and_b32 s20, 1, s20 +; GCN-NEXT: s_ashr_i32 s21, s7, 31 +; GCN-NEXT: s_ashr_i64 s[6:7], s[6:7], s12 +; GCN-NEXT: s_cmp_eq_u32 s20, 1 +; GCN-NEXT: s_cselect_b32 s20, s17, s19 +; GCN-NEXT: s_cselect_b32 s22, s16, s18 +; GCN-NEXT: s_cselect_b32 s23, s7, s21 +; GCN-NEXT: s_cselect_b32 s21, s6, s21 +; GCN-NEXT: s_sub_i32 s16, 64, s8 +; GCN-NEXT: s_lshl_b64 s[16:17], s[2:3], s16 ; GCN-NEXT: s_lshr_b64 s[18:19], s[0:1], s8 -; GCN-NEXT: s_ashr_i64 s[20:21], s[2:3], s20 -; GCN-NEXT: s_or_b64 s[18:19], s[18:19], s[22:23] -; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s19, s19, s21 -; GCN-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], 0 -; GCN-NEXT: s_and_b64 s[22:23], s[10:11], exec -; GCN-NEXT: s_cselect_b32 s9, s1, s19 -; GCN-NEXT: s_and_b64 s[22:23], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s1, s18, s20 -; GCN-NEXT: s_and_b64 s[10:11], s[10:11], exec -; GCN-NEXT: v_cmp_lt_u64_e64 s[10:11], s[12:13], 64 -; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 -; GCN-NEXT: s_cselect_b32 s22, s0, s1 -; GCN-NEXT: s_and_b64 s[0:1], s[18:19], s[10:11] -; GCN-NEXT: s_sub_i32 s18, 64, s12 -; GCN-NEXT: s_sub_i32 s10, s12, 64 -; GCN-NEXT: s_lshl_b64 s[18:19], s[6:7], s18 -; GCN-NEXT: s_lshr_b64 s[20:21], s[4:5], s12 -; GCN-NEXT: s_ashr_i64 s[10:11], s[6:7], s10 -; GCN-NEXT: s_or_b64 s[18:19], s[20:21], s[18:19] -; GCN-NEXT: s_and_b64 s[20:21], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s11, s19, s11 -; GCN-NEXT: s_or_b64 s[14:15], s[12:13], s[14:15] -; GCN-NEXT: v_cmp_eq_u64_e64 s[14:15], s[14:15], 0 -; GCN-NEXT: s_and_b64 s[20:21], s[14:15], exec -; GCN-NEXT: s_cselect_b32 s13, s5, s11 -; GCN-NEXT: s_and_b64 s[20:21], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s5, s18, s10 -; GCN-NEXT: s_and_b64 s[10:11], s[14:15], exec -; GCN-NEXT: s_cselect_b32 s10, s4, s5 -; GCN-NEXT: s_ashr_i32 s11, s3, 31 +; GCN-NEXT: s_or_b64 s[16:17], s[18:19], s[16:17] +; GCN-NEXT: v_cmp_lt_u64_e64 s[18:19], s[8:9], 64 +; GCN-NEXT: s_sub_i32 s6, s8, 64 +; GCN-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[18:19] +; GCN-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 +; GCN-NEXT: s_ashr_i64 s[6:7], s[2:3], s6 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[18:19] +; GCN-NEXT: v_readfirstlane_b32 s18, v1 +; GCN-NEXT: v_readfirstlane_b32 s19, v0 +; GCN-NEXT: s_and_b32 s18, s18, s19 +; GCN-NEXT: s_and_b32 s18, 1, s18 +; GCN-NEXT: s_ashr_i32 s19, s3, 31 ; GCN-NEXT: s_ashr_i64 s[2:3], s[2:3], s8 -; GCN-NEXT: s_and_b64 s[4:5], s[16:17], exec -; GCN-NEXT: s_cselect_b32 s4, s3, s11 -; GCN-NEXT: s_cselect_b32 s5, s2, s11 -; GCN-NEXT: s_ashr_i32 s8, s7, 31 -; GCN-NEXT: s_ashr_i64 s[2:3], s[6:7], s12 +; GCN-NEXT: s_cmp_eq_u32 s18, 1 +; GCN-NEXT: s_cselect_b32 s6, s16, s6 +; GCN-NEXT: s_cselect_b32 s7, s17, s7 +; GCN-NEXT: s_cselect_b32 s16, s3, s19 +; GCN-NEXT: s_cselect_b32 s17, s2, s19 +; GCN-NEXT: s_or_b64 s[2:3], s[8:9], s[10:11] +; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[2:3], 0 +; GCN-NEXT: v_mov_b32_e32 v2, s21 +; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec +; GCN-NEXT: s_cselect_b32 s2, s1, s7 +; GCN-NEXT: s_cselect_b32 s3, s0, s6 +; GCN-NEXT: s_or_b64 s[0:1], s[12:13], s[14:15] +; GCN-NEXT: v_cmp_eq_u64_e64 s[0:1], s[0:1], 0 +; GCN-NEXT: v_mov_b32_e32 v3, s23 ; GCN-NEXT: s_and_b64 
s[0:1], s[0:1], exec -; GCN-NEXT: s_cselect_b32 s0, s3, s8 -; GCN-NEXT: s_cselect_b32 s1, s2, s8 -; GCN-NEXT: v_mov_b32_e32 v0, s10 -; GCN-NEXT: v_mov_b32_e32 v1, s13 -; GCN-NEXT: v_mov_b32_e32 v2, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s0 +; GCN-NEXT: s_cselect_b32 s0, s4, s22 +; GCN-NEXT: s_cselect_b32 s1, s5, s20 +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-NEXT: s_nop 0 -; GCN-NEXT: v_mov_b32_e32 v0, s22 -; GCN-NEXT: v_mov_b32_e32 v1, s9 -; GCN-NEXT: v_mov_b32_e32 v2, s5 -; GCN-NEXT: v_mov_b32_e32 v3, s4 +; GCN-NEXT: v_mov_b32_e32 v0, s3 +; GCN-NEXT: v_mov_b32_e32 v1, s2 +; GCN-NEXT: v_mov_b32_e32 v2, s17 +; GCN-NEXT: v_mov_b32_e32 v3, s16 ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm %shift = ashr <2 x i128> %lhs, %rhs diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll index e5047cfa0b4e9..16f90e9ea09b3 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cfg-loop-assert.ll @@ -10,8 +10,11 @@ define amdgpu_kernel void @test(i32 %arg, i32 %arg1) { ; CHECK-NEXT: s_cselect_b64 s[2:3], -1, 0 ; CHECK-NEXT: s_cmp_eq_u32 s1, 0 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_or_b64 s[0:1], s[2:3], s[0:1] -; CHECK-NEXT: s_and_b64 vcc, exec, s[0:1] +; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; CHECK-NEXT: v_or_b32_e32 v0, v0, v1 +; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; CHECK-NEXT: s_cbranch_vccnz .LBB0_3 ; CHECK-NEXT: ; %bb.1: ; %bb9 ; CHECK-NEXT: s_and_b64 vcc, exec, 0 diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll index 34de1e48bfb59..f590365e826a9 100644 --- a/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll +++ b/llvm/test/CodeGen/AMDGPU/si-annotate-nested-control-flows.ll @@ -18,9 +18,10 @@ define void @nested_inf_loop(i1 %0, i1 %1) { ; ISA: ; %bb.0: ; %BB ; ISA-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; ISA-NEXT: v_and_b32_e32 v1, 1, v1 -; ISA-NEXT: v_and_b32_e32 v0, 1, v0 +; ISA-NEXT: v_not_b32_e32 v0, v0 ; ISA-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v1 -; ISA-NEXT: v_cmp_ne_u32_e64 s[6:7], 1, v0 +; ISA-NEXT: v_and_b32_e32 v0, 1, v0 +; ISA-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v0 ; ISA-NEXT: s_mov_b64 s[8:9], 0 ; ISA-NEXT: .LBB0_1: ; %BB1 ; ISA-NEXT: ; =>This Loop Header: Depth=1 diff --git a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll index 761ff7786b98e..85f6805e87adf 100644 --- a/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll +++ b/llvm/test/CodeGen/AMDGPU/si-optimize-vgpr-live-range-dbg-instr.ll @@ -13,8 +13,9 @@ define void @__omp_offloading_35_36570d3__ZN6openmc31process_advance_particle_ev ; GCN-NEXT: v_mov_b32_e32 v1, 0 ; GCN-NEXT: v_mov_b32_e32 v2, 0 ; GCN-NEXT: global_load_dwordx2 v[1:2], v[1:2], off +; GCN-NEXT: v_not_b32_e32 v0, v0 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 -; GCN-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GCN-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GCN-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GCN-NEXT: s_cbranch_execnz .LBB0_3 diff --git a/llvm/test/CodeGen/AMDGPU/sign_extend.ll b/llvm/test/CodeGen/AMDGPU/sign_extend.ll index af78768520d23..a17ded645d2da 100644 --- 
a/llvm/test/CodeGen/AMDGPU/sign_extend.ll +++ b/llvm/test/CodeGen/AMDGPU/sign_extend.ll @@ -264,8 +264,10 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; SI-NEXT: s_cmp_eq_u32 s2, s3 ; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; SI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_and_b32_e32 v0, v1, v0 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -280,8 +282,10 @@ define amdgpu_kernel void @s_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 ; VI-NEXT: s_cmp_eq_u32 s2, s3 ; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 -; VI-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; VI-NEXT: v_and_b32_e32 v0, v1, v0 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %cmp0 = icmp eq i32 %a, %b @@ -301,10 +305,12 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; SI-NEXT: s_mov_b32 s6, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_cmp_eq_u32 s1, s2 +; SI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; SI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; SI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; SI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v0, v0, v1 +; SI-NEXT: v_bfe_i32 v0, v0, 0, 1 ; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -316,10 +322,12 @@ define amdgpu_kernel void @v_sext_i1_to_i16_with_and(ptr addrspace(1) %out, i32 ; VI-NEXT: s_mov_b32 s6, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_eq_u32 s1, s2 +; VI-NEXT: s_cselect_b64 s[2:3], -1, 0 ; VI-NEXT: v_cmp_eq_u32_e32 vcc, s0, v0 -; VI-NEXT: s_cselect_b64 s[0:1], -1, 0 -; VI-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[0:1] +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; VI-NEXT: v_and_b32_e32 v0, v0, v1 +; VI-NEXT: v_bfe_i32 v0, v0, 0, 1 ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm %tid = tail call i32 @llvm.amdgcn.workitem.id.x() #1 diff --git a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll index a5299ea36958d..f337ff545393d 100644 --- a/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll +++ b/llvm/test/CodeGen/AMDGPU/simplifydemandedbits-recursion.ll @@ -27,14 +27,14 @@ define amdgpu_kernel void @foo(ptr addrspace(1) noalias nocapture readonly %arg, ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_bitcmp1_b32 s6, 8 ; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0 -; CHECK-NEXT: s_bitcmp1_b32 s6, 16 +; CHECK-NEXT: s_not_b32 s4, s6 ; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_bitcmp1_b32 s4, 16 ; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v2 -; CHECK-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; CHECK-NEXT: s_cselect_b64 s[4:5], -1, 0 +; CHECK-NEXT: s_not_b32 s2, s2 ; CHECK-NEXT: s_bitcmp1_b32 s2, 24 ; CHECK-NEXT: s_cselect_b64 s[6:7], -1, 0 -; CHECK-NEXT: s_xor_b64 s[6:7], s[6:7], -1 ; CHECK-NEXT: s_bitcmp1_b32 s3, 0 ; 
CHECK-NEXT: s_cselect_b64 s[8:9], -1, 0 ; CHECK-NEXT: s_bitcmp1_b32 s10, 8 diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index 0b68a0534fa08..4702845591999 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -378,114 +378,132 @@ entry: define amdgpu_kernel void @s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_sint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] +; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_sint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s10 -; VI-NEXT: s_mov_b32 s13, s11 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s15, s7 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc -; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s[0:1] +; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_xor_b32_e32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v0, 
1, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s0, s8 -; VI-NEXT: s_mov_b32 s1, s9 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: s_sint_to_fp_i1_to_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 -; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10 -; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 -; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10 -; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11 -; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, vcc_lo ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll index c9e5ff444f715..2fecf3c60dc13 100644 --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -122,26 +122,34 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; ; GCN-IR-LABEL: s_test_srem: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 -; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[6:7] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] +; GCN-IR-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-IR-NEXT: v_readfirstlane_b32 s9, v0 +; GCN-IR-NEXT: s_or_b32 s11, s8, s9 ; GCN-IR-NEXT: s_sub_u32 s12, s10, s18 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[8:9], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[12:13], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, 
s[8:9] +; GCN-IR-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-IR-NEXT: s_or_b32 s11, s11, s8 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[14:15] +; GCN-IR-NEXT: v_or_b32_e32 v0, s11, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: s_bitcmp1_b32 s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 @@ -153,47 +161,47 @@ define amdgpu_kernel void @s_test_srem(ptr addrspace(1) %out, i64 %x, i64 %y) { ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 -; GCN-IR-NEXT: s_add_u32 s16, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] -; GCN-IR-NEXT: s_add_u32 s10, s4, s18 -; GCN-IR-NEXT: s_addc_u32 s11, s5, 0 +; GCN-IR-NEXT: s_add_u32 s16, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 +; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] +; GCN-IR-NEXT: s_add_u32 s10, s6, s18 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 +; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s4, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s4, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s4, 31 +; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 +; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s4, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_and_b32 s6, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s12, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s6, s9 +; GCN-IR-NEXT: s_mul_i32 s0, s4, s9 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s7, s8 +; GCN-IR-NEXT: s_mul_i32 s0, s5, s8 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s6, s8 +; GCN-IR-NEXT: s_mul_i32 s0, s4, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 @@ -343,28 +351,36 @@ define i64 @v_test_srem(i64 %x, i64 %y) { ; GCN-IR-NEXT: v_xor_b32_e32 v3, v3, v4 ; GCN-IR-NEXT: v_sub_i32_e32 v2, 
vcc, v2, v4 ; GCN-IR-NEXT: v_subb_u32_e32 v3, vcc, v3, v4, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_mov_b32_e32 v15, v14 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v5, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_xor_b32_e32 v8, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_mov_b32_e32 v15, v14 -; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 @@ -1051,7 +1067,7 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; GCN-IR-NEXT: s_mov_b32 s13, 0 +; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_ashr_i64 s[2:3], s[2:3], 31 ; GCN-IR-NEXT: s_ashr_i64 s[8:9], s[4:5], 31 @@ -1065,82 +1081,91 @@ define amdgpu_kernel void @s_test_srem33_64(ptr addrspace(1) %out, i64 %x, i64 % ; GCN-IR-NEXT: s_xor_b64 s[8:9], s[8:9], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s8, s8, s2 ; GCN-IR-NEXT: s_subb_u32 s9, s9, s2 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[8:9], 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[6:7], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[8:9] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[2:3], s[10:11] -; GCN-IR-NEXT: s_flbit_i32_b64 s20, s[6:7] -; GCN-IR-NEXT: s_sub_u32 s14, s12, s20 -; GCN-IR-NEXT: s_subb_u32 s15, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[16:17], s[14:15], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 63 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[10:11], s[16:17] -; GCN-IR-NEXT: s_and_b64 s[10:11], s[16:17], exec -; GCN-IR-NEXT: s_cselect_b32 s11, 0, s7 -; GCN-IR-NEXT: 
s_cselect_b32 s10, 0, s6 -; GCN-IR-NEXT: s_or_b64 s[16:17], s[16:17], s[18:19] -; GCN-IR-NEXT: s_mov_b64 s[2:3], 0 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[16:17] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[8:9], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[6:7] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] +; GCN-IR-NEXT: v_readfirstlane_b32 s2, v1 +; GCN-IR-NEXT: v_readfirstlane_b32 s3, v0 +; GCN-IR-NEXT: s_or_b32 s13, s2, s3 +; GCN-IR-NEXT: s_sub_u32 s2, s12, s16 +; GCN-IR-NEXT: s_subb_u32 s3, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[2:3], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[2:3], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[14:15] +; GCN-IR-NEXT: v_readfirstlane_b32 s14, v0 +; GCN-IR-NEXT: s_or_b32 s13, s13, s14 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[18:19] +; GCN-IR-NEXT: v_or_b32_e32 v0, s13, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: s_bitcmp1_b32 s13, 0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GCN-IR-NEXT: s_cselect_b32 s15, 0, s7 +; GCN-IR-NEXT: s_cselect_b32 s14, 0, s6 +; GCN-IR-NEXT: s_mov_b32 s13, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s16, s14, 1 -; GCN-IR-NEXT: s_addc_u32 s17, s15, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[16:17], 0 -; GCN-IR-NEXT: s_sub_i32 s14, 63, s14 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[6:7], s14 +; GCN-IR-NEXT: s_add_u32 s14, s2, 1 +; GCN-IR-NEXT: s_addc_u32 s15, s3, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[14:15], 0 +; GCN-IR-NEXT: s_sub_i32 s2, 63, s2 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[18:19] +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], s2 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s16 +; GCN-IR-NEXT: s_lshr_b64 s[14:15], s[6:7], s14 ; GCN-IR-NEXT: s_add_u32 s18, s8, -1 ; GCN-IR-NEXT: s_addc_u32 s19, s9, -1 -; GCN-IR-NEXT: s_not_b64 s[2:3], s[12:13] -; GCN-IR-NEXT: s_add_u32 s12, s2, s20 -; GCN-IR-NEXT: s_addc_u32 s13, s3, 0 +; GCN-IR-NEXT: s_not_b64 s[10:11], s[12:13] +; GCN-IR-NEXT: s_add_u32 s12, s10, s16 +; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 ; GCN-IR-NEXT: s_mov_b64 s[16:17], 0 -; GCN-IR-NEXT: s_mov_b32 s3, 0 +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: .LBB8_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[14:15], s[14:15], 1 -; GCN-IR-NEXT: s_lshr_b32 s2, s11, 31 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[2:3] -; GCN-IR-NEXT: s_or_b64 s[10:11], s[16:17], s[10:11] -; GCN-IR-NEXT: s_sub_u32 s2, s18, s14 -; GCN-IR-NEXT: s_subb_u32 s2, s19, s15 -; GCN-IR-NEXT: s_ashr_i32 s16, s2, 31 +; GCN-IR-NEXT: s_lshr_b32 s10, s3, 31 +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[10:11] +; GCN-IR-NEXT: s_or_b64 s[2:3], s[16:17], s[2:3] +; GCN-IR-NEXT: s_sub_u32 s10, s18, s14 +; GCN-IR-NEXT: s_subb_u32 s10, s19, s15 +; GCN-IR-NEXT: s_ashr_i32 s16, s10, 31 ; GCN-IR-NEXT: s_mov_b32 s17, s16 -; GCN-IR-NEXT: s_and_b32 s2, s16, 1 +; GCN-IR-NEXT: s_and_b32 s10, s16, 1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[16:17], s[8:9] ; GCN-IR-NEXT: s_sub_u32 s14, s14, s16 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s17 ; GCN-IR-NEXT: s_add_u32 s12, s12, 1 ; GCN-IR-NEXT: s_addc_u32 s13, s13, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[20:21], s[12:13], 0 -; GCN-IR-NEXT: s_mov_b64 s[16:17], s[2:3] +; GCN-IR-NEXT: s_mov_b64 s[16:17], s[10:11] ; 
GCN-IR-NEXT: s_and_b64 vcc, exec, s[20:21] ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 ; GCN-IR-NEXT: .LBB8_4: ; %Flow7 -; GCN-IR-NEXT: s_lshl_b64 s[10:11], s[10:11], 1 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[2:3], s[10:11] +; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[2:3], 1 +; GCN-IR-NEXT: s_or_b64 s[14:15], s[10:11], s[2:3] ; GCN-IR-NEXT: .LBB8_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s14 ; GCN-IR-NEXT: v_mul_hi_u32 v0, s8, v0 -; GCN-IR-NEXT: s_mul_i32 s11, s8, s11 -; GCN-IR-NEXT: s_mul_i32 s9, s9, s10 -; GCN-IR-NEXT: s_mul_i32 s8, s8, s10 -; GCN-IR-NEXT: v_readfirstlane_b32 s12, v0 -; GCN-IR-NEXT: s_add_i32 s11, s12, s11 -; GCN-IR-NEXT: s_add_i32 s11, s11, s9 -; GCN-IR-NEXT: s_sub_u32 s6, s6, s8 -; GCN-IR-NEXT: s_subb_u32 s7, s7, s11 -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[6:7], s[4:5] -; GCN-IR-NEXT: s_sub_u32 s4, s6, s4 -; GCN-IR-NEXT: s_subb_u32 s5, s7, s5 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s4 +; GCN-IR-NEXT: s_mul_i32 s10, s8, s15 +; GCN-IR-NEXT: s_mul_i32 s9, s9, s14 +; GCN-IR-NEXT: s_mul_i32 s8, s8, s14 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s10, v0 +; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s9, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s8 +; GCN-IR-NEXT: v_mov_b32_e32 v2, s7 +; GCN-IR-NEXT: v_sub_i32_e32 v1, vcc, s6, v1 +; GCN-IR-NEXT: v_subb_u32_e32 v0, vcc, v2, v0, vcc +; GCN-IR-NEXT: v_xor_b32_e32 v1, s4, v1 +; GCN-IR-NEXT: v_xor_b32_e32 v2, s5, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v3, s5 +; GCN-IR-NEXT: v_subrev_i32_e32 v0, vcc, s4, v1 ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s5 +; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v2, v3, vcc ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %1 = ashr i64 %x, 31 @@ -1349,17 +1374,23 @@ define amdgpu_kernel void @s_test_srem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_xor_b64 s[2:3], s[2:3], s[8:9] ; GCN-IR-NEXT: s_sub_u32 s4, s2, s8 ; GCN-IR-NEXT: s_subb_u32 s5, s3, s8 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[2:3], s[4:5], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] ; GCN-IR-NEXT: s_add_u32 s2, s12, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[2:3], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[2:3], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[8:9], s[10:11] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[10:11], exec +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[8:9], s[2:3], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[2:3], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] +; GCN-IR-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s9, v1 +; GCN-IR-NEXT: s_or_b32 s9, s8, s9 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[10:11] +; GCN-IR-NEXT: v_or_b32_e32 v0, s9, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: s_bitcmp1_b32 s9, 0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, 24 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] ; GCN-IR-NEXT: s_mov_b32 s9, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 @@ -1536,18 +1567,25 @@ define i64 @v_test_srem_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffc5 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: 
v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: s_movk_i32 s4, 0xffc5 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v10 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, s[4:5] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 24, 0, vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB11_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 @@ -1727,19 +1765,26 @@ define i64 @v_test_srem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: s_movk_i32 s6, 0xffd0 -; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s6, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: s_movk_i32 s4, 0xffd0 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, s4, v10 +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0x8000 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0x8000 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v7, 0, vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 @@ -1831,21 +1876,28 @@ define i64 @v_test_srem_pow2_k_den_i64(i64 %x) { ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, v0, v12 ; GCN-IR-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] -; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, 
v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 48, v10 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] ; GCN-IR-NEXT: v_mov_b32_e32 v13, v12 -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB13_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/ssubsat.ll index 40d80f5e83e36..dc8f72a0e763a 100644 --- a/llvm/test/CodeGen/AMDGPU/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/ssubsat.ll @@ -82,11 +82,15 @@ define i16 @v_ssubsat_i16(i16 %lhs, i16 %rhs) { ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i16: @@ -121,24 +125,32 @@ define i32 @v_ssubsat_i32(i32 %lhs, i32 %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 -; GFX6-NEXT: v_sub_i32_e64 v1, s[4:5], v0, v1 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v1, vcc, v0, v1 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v1, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 -; GFX8-NEXT: v_sub_u32_e64 v1, s[4:5], v0, v1 -; GFX8-NEXT: 
v_cmp_lt_i32_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v1, vcc, v0, v1 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v2, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_i32: @@ -183,19 +195,27 @@ define <2 x i16> @v_ssubsat_v2i16(<2 x i16> %lhs, <2 x i16> %rhs) { ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v4, v3, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v4, v3 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, v2, v3 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v4 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v5, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v1 ; GFX8-NEXT: v_sub_u16_e32 v1, v0, v1 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v1, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v1 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v1, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v1 +; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v1, v3, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -246,25 +266,37 @@ define <3 x i16> @v_ssubsat_v3i16(<3 x i16> %lhs, <3 x i16> %rhs) { ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_xor_b32_e32 v7, 0xffff8000, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; 
GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v3, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v5, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v5, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v3, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v3, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v3, 0xffff8000, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v4 ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] @@ -323,37 +355,53 @@ define <2 x float> @v_ssubsat_v4i16(<4 x i16> %lhs, <4 x i16> %rhs) { ; GFX8-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GFX8-NEXT: v_sub_u16_e32 v6, v5, v4 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v6, v5 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v4 -; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v6 -; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v4, v5 +; GFX8-NEXT: v_ashrrev_i16_e32 v7, 15, v6 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_xor_b32_e32 v7, 0xffff8000, v7 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v6, v7, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 ; GFX8-NEXT: v_sub_u16_e32 v2, v0, v2 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i16_e32 v0, 15, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v5, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i16_e32 v5, 15, v2 +; GFX8-NEXT: v_xor_b32_e32 v5, 0xffff8000, v5 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 16, v4 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v5, vcc ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: v_lshrrev_b32_e32 v2, 16, v3 ; GFX8-NEXT: v_lshrrev_b32_e32 v4, 16, v1 ; GFX8-NEXT: v_sub_u16_e32 v5, v4, v2 ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v5, v4 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], 0, v2 -; GFX8-NEXT: v_ashrrev_i16_e32 v2, 15, v5 -; GFX8-NEXT: v_xor_b32_e32 v2, 0xffff8000, v2 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, v2, v4 +; GFX8-NEXT: v_ashrrev_i16_e32 v6, 15, v5 +; 
GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_xor_b32_e32 v6, 0xffff8000, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v6, vcc ; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, 0, v3 ; GFX8-NEXT: v_sub_u16_e32 v3, v1, v3 -; GFX8-NEXT: v_cmp_lt_i16_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_ashrrev_i16_e32 v1, 15, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0xffff8000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i16_e32 vcc, v3, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i16_e32 v4, 15, v3 +; GFX8-NEXT: v_xor_b32_e32 v4, 0xffff8000, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 ; GFX8-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -380,38 +428,54 @@ define <2 x i32> @v_ssubsat_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 -; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v0, v2 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v2, s[4:5], v1, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v2, vcc, v0, v2 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v4, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v1, v3 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v2i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v2 -; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v0, v2 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v2, s[4:5], v1, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v2, vcc, v0, v2 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v2, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v4, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v2 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; 
GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v2, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v2 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v1, v3 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v2, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 +; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v2i32: @@ -436,52 +500,76 @@ define <3 x i32> @v_ssubsat_v3i32(<3 x i32> %lhs, <3 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 -; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v0, v3 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v1, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v3, vcc, v0, v3 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v6, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX6-NEXT: v_sub_i32_e64 v3, s[4:5], v2, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v1, v4 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v4, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v2, v5 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v4, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, v3, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v3i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v3 -; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v0, v3 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v0 -; GFX8-NEXT: 
v_ashrrev_i32_e32 v0, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v1, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v3, vcc, v0, v3 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v3, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v3 +; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v3, v6, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v3, s[4:5], v2, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v1, v4 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v3, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v3, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v3, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v2, v5 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, v3, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 +; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v3, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v3i32: @@ -508,66 +596,98 @@ define <4 x i32> @v_ssubsat_v4i32(<4 x i32> %lhs, <4 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 -; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v0, v4 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v1, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v4 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v8, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v2, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v1, v5 +; GFX6-NEXT: 
v_cmp_lt_i32_e32 vcc, v5, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, v4, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v5 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX6-NEXT: v_sub_i32_e64 v4, s[4:5], v3, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v2, v6 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v5, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v5 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v5, vcc, v3, v7 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v5, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v5 +; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v4i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v4 -; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v0, v4 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v1, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v4 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v4, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v8, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v4 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v5 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v2, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v1, v5 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v5, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v4, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v5 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v4, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v6 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v2 -; 
GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v4, s[4:5], v3, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v2, v6 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v5, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, v4, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v5 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v7 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v4, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v4 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v5, vcc, v3, v7 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v5, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v5 +; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v4i32: @@ -596,122 +716,186 @@ define <8 x i32> @v_ssubsat_v8i32(<8 x i32> %lhs, <8 x i32> %rhs) { ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8 -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v0, v8 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 -; GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v1, v9 +; GFX6-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v8, vcc, v0, v8 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v8, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v16, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v8 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v8, v16, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v2, v10 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v1, v9 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, v8, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v3, v11 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v2, v10 +; 
GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, v8, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v4, v12 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v3, v11 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v3, v8, v3 +; GFX6-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v5, v13 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v4, v12 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v4, v8, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v6, v14 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v5, v13 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v5, v8, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX6-NEXT: v_sub_i32_e64 v8, s[4:5], v7, v15 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v6, v14 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v6, v8, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX6-NEXT: s_xor_b64 
vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v9, vcc, v7, v15 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v9, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v7, v8, v7 +; GFX6-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v8i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v8 -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v0, v8 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v1, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v8, vcc, v0, v8 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v8, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v8 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v8, v16, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v9 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v8, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v2, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v1, v9 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v8, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v10 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v2 -; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v3, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v2, v10 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, v8, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v9, v8, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v11 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v8, v3, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v4, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v3, v11 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, v8, v3 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, 
v9 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v9, v8, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v12 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v5, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v4, v12 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v8, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v9, v8, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v13 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v6, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v5, v13 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v5, v8, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v9, v8, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v14 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v8, s[4:5], v7, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v6, v14 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v6, v8, v6 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v9, v8, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v15 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v8, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v8 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v9, vcc, v7, v15 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v9, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, v8, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v9 +; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v9, v8, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v8i32: @@ -747,239 +931,367 @@ define <16 x i32> @v_ssubsat_v16i32(<16 x i32> %lhs, <16 x i32> %rhs) { ; GFX6-LABEL: v_ssubsat_v16i32: ; GFX6: ; %bb.0: ; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_sub_i32_e32 v31, vcc, v0, v16 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v0, v16 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; 
GFX6-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v1, v17 +; GFX6-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v31, v0 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v16, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v31 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v31, v16, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v1, v17 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v2, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v1 +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v1, v17, v1 +; GFX6-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX6-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v2, v18 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 -; GFX6-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v3, v19 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v2 +; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v2, v17, v2 +; GFX6-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX6-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v3, v19 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX6-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v4, v20 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v3 +; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v3, v17, v3 +; GFX6-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GFX6-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX6-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v4, v20 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX6-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX6-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v5, v21 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v4 +; GFX6-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX6-NEXT: 
v_xor_b32_e32 v4, v17, v4 +; GFX6-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v5, v21 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 -; GFX6-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v6, v22 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v5, v17, v5 +; GFX6-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX6-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v6, v22 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 -; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v7, v23 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v6 +; GFX6-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v6, v17, v6 +; GFX6-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v7, v23 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 -; GFX6-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v8, v24 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v7 +; GFX6-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v7, v17, v7 +; GFX6-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX6-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v8, v24 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 -; GFX6-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v9, v25 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v8 +; GFX6-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v8, v17, v8 +; GFX6-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX6-NEXT: v_cndmask_b32_e32 v8, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v9, v25 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 -; GFX6-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX6-NEXT: 
s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v10, v26 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v9 +; GFX6-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v9, v17, v9 +; GFX6-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX6-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v10, v26 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 -; GFX6-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v11, v27 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v10 +; GFX6-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v10, v17, v10 +; GFX6-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX6-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc +; GFX6-NEXT: v_sub_i32_e32 v16, vcc, v11, v27 ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 -; GFX6-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v11, 0x80000000, v11 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v12, v28 +; GFX6-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v16, v11 +; GFX6-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v11, v17, v11 +; GFX6-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX6-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX6-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX6-NEXT: v_cndmask_b32_e32 v11, v16, v17, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 -; GFX6-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v13, v29 +; GFX6-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v12, v28 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v17, v12 +; GFX6-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v12, v16, v12 +; GFX6-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v17 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX6-NEXT: v_cndmask_b32_e32 v12, v17, v16, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 -; GFX6-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc -; GFX6-NEXT: v_sub_i32_e64 v17, s[4:5], v14, v30 +; GFX6-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v13, v29 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v17, v13 +; GFX6-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v13, v16, v13 +; GFX6-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v17 +; GFX6-NEXT: v_xor_b32_e32 v16, 
0x80000000, v16 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX6-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc ; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 -; GFX6-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX6-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v14, v30 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v17, v14 +; GFX6-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v14, v16, v14 +; GFX6-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v17 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX6-NEXT: v_cndmask_b32_e32 v14, v17, v16, vcc ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX6-NEXT: v_sub_i32_e64 v16, s[4:5], v15, v16 -; GFX6-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 -; GFX6-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX6-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 -; GFX6-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX6-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX6-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX6-NEXT: v_sub_i32_e32 v17, vcc, v15, v18 +; GFX6-NEXT: v_cmp_lt_i32_e32 vcc, v17, v15 +; GFX6-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v15, v16, v15 +; GFX6-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX6-NEXT: v_ashrrev_i32_e32 v16, 31, v17 +; GFX6-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX6-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_ssubsat_v16i32: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_sub_u32_e32 v31, vcc, v0, v16 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v0, v16 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v0 -; GFX8-NEXT: v_ashrrev_i32_e32 v0, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v0, 0x80000000, v0 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v1, v17 +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v31, v0 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v16, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v31 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v31, v16, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v1, v17 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v17 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v1 -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v1, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v2, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v1 +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v1, v17, v1 +; GFX8-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX8-NEXT: v_cndmask_b32_e32 v1, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v2, v18 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v2 
-; GFX8-NEXT: v_ashrrev_i32_e32 v2, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v2, 0x80000000, v2 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v3, v19 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v2 +; GFX8-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, v17, v2 +; GFX8-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GFX8-NEXT: v_cndmask_b32_e32 v2, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v3, v19 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v19 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v3 -; GFX8-NEXT: v_ashrrev_i32_e32 v3, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v3, 0x80000000, v3 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v3, vcc -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v4, v20 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v3 +; GFX8-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v3, v17, v3 +; GFX8-NEXT: buffer_load_dword v18, off, s[0:3], s32 +; GFX8-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v4, v20 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v20 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v4 -; GFX8-NEXT: v_ashrrev_i32_e32 v4, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v4, 0x80000000, v4 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v4, vcc -; GFX8-NEXT: buffer_load_dword v16, off, s[0:3], s32 -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v5, v21 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v4 +; GFX8-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v4, v17, v4 +; GFX8-NEXT: v_and_b32_e32 v4, 1, v4 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v5, v21 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v21 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v5 -; GFX8-NEXT: v_ashrrev_i32_e32 v5, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v5, 0x80000000, v5 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v5, v17, v5, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v6, v22 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v5, v17, v5 +; GFX8-NEXT: v_and_b32_e32 v5, 1, v5 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 +; GFX8-NEXT: v_cndmask_b32_e32 v5, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v6, v22 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v22 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v6 -; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v6, 0x80000000, v6 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v6, v17, v6, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v7, v23 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v6 +; GFX8-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GFX8-NEXT: 
v_xor_b32_e32 v6, v17, v6 +; GFX8-NEXT: v_and_b32_e32 v6, 1, v6 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v6, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v7, v23 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v23 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v7 -; GFX8-NEXT: v_ashrrev_i32_e32 v7, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v7, 0x80000000, v7 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v7, v17, v7, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v8, v24 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v7 +; GFX8-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v7, v17, v7 +; GFX8-NEXT: v_and_b32_e32 v7, 1, v7 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7 +; GFX8-NEXT: v_cndmask_b32_e32 v7, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v8, v24 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v24 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v8 -; GFX8-NEXT: v_ashrrev_i32_e32 v8, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v8, 0x80000000, v8 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v8, v17, v8, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v9, v25 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v8 +; GFX8-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v8, v17, v8 +; GFX8-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GFX8-NEXT: v_cndmask_b32_e32 v8, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v9, v25 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v25 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v9 -; GFX8-NEXT: v_ashrrev_i32_e32 v9, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v9, 0x80000000, v9 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v9, v17, v9, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v10, v26 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v9 +; GFX8-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v9, v17, v9 +; GFX8-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v10, v26 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v26 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v10 -; GFX8-NEXT: v_ashrrev_i32_e32 v10, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v10, 0x80000000, v10 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v10, v17, v10, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v11, v27 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v10 +; GFX8-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v10, v17, v10 +; GFX8-NEXT: v_and_b32_e32 v10, 1, v10 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; GFX8-NEXT: v_cndmask_b32_e32 v10, v16, v17, vcc +; GFX8-NEXT: v_sub_u32_e32 v16, vcc, v11, v27 ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v27 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v11 -; GFX8-NEXT: v_ashrrev_i32_e32 v11, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v11, 
0x80000000, v11 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v11, v17, v11, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v12, v28 +; GFX8-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v16, v11 +; GFX8-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v11, v17, v11 +; GFX8-NEXT: v_and_b32_e32 v11, 1, v11 +; GFX8-NEXT: v_ashrrev_i32_e32 v17, 31, v16 +; GFX8-NEXT: v_xor_b32_e32 v17, 0x80000000, v17 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v11, v16, v17, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v28 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v12 -; GFX8-NEXT: v_ashrrev_i32_e32 v12, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v12, 0x80000000, v12 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v12, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v13, v29 +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v12, v28 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v17, v12 +; GFX8-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v12, v16, v12 +; GFX8-NEXT: v_and_b32_e32 v12, 1, v12 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v17 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v12 +; GFX8-NEXT: v_cndmask_b32_e32 v12, v17, v16, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v29 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v13 -; GFX8-NEXT: v_ashrrev_i32_e32 v13, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v13, 0x80000000, v13 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v13, vcc -; GFX8-NEXT: v_sub_u32_e64 v17, s[4:5], v14, v30 +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v13, v29 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v17, v13 +; GFX8-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v13, v16, v13 +; GFX8-NEXT: v_and_b32_e32 v13, 1, v13 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v17 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v13 +; GFX8-NEXT: v_cndmask_b32_e32 v13, v17, v16, vcc ; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v30 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v17, v14 -; GFX8-NEXT: v_ashrrev_i32_e32 v14, 31, v17 -; GFX8-NEXT: v_xor_b32_e32 v14, 0x80000000, v14 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v14, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v14, v30 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v17, v14 +; GFX8-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v14, v16, v14 +; GFX8-NEXT: v_and_b32_e32 v14, 1, v14 +; GFX8-NEXT: v_ashrrev_i32_e32 v16, 31, v17 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v14 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v17, v16, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v16 -; GFX8-NEXT: v_sub_u32_e64 v16, s[4:5], v15, v16 -; GFX8-NEXT: v_cmp_lt_i32_e64 s[4:5], v16, v15 -; GFX8-NEXT: v_ashrrev_i32_e32 v15, 31, v16 -; GFX8-NEXT: v_xor_b32_e32 v15, 0x80000000, v15 -; GFX8-NEXT: s_xor_b64 vcc, vcc, s[4:5] -; GFX8-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, 0, v18 +; GFX8-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GFX8-NEXT: v_sub_u32_e32 v17, vcc, v15, v18 +; GFX8-NEXT: v_cmp_lt_i32_e32 vcc, v17, v15 +; GFX8-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v15, v16, v15 +; GFX8-NEXT: v_and_b32_e32 v15, 1, v15 +; GFX8-NEXT: 
v_ashrrev_i32_e32 v16, 31, v17 +; GFX8-NEXT: v_xor_b32_e32 v16, 0x80000000, v16 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v15 +; GFX8-NEXT: v_cndmask_b32_e32 v15, v17, v16, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_ssubsat_v16i32: @@ -1062,11 +1374,15 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX6-NEXT: v_sub_i32_e32 v4, vcc, v0, v2 ; GFX6-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX6-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX6-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX6-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX6-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX6-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; GFX6-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX6-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX6-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX6-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX6-NEXT: s_setpc_b64 s[30:31] ; @@ -1076,11 +1392,15 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX8-NEXT: v_sub_u32_e32 v4, vcc, v0, v2 ; GFX8-NEXT: v_subb_u32_e32 v5, vcc, v1, v3, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX8-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX8-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX8-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX8-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; GFX8-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX8-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX8-NEXT: s_setpc_b64 s[30:31] ; @@ -1090,11 +1410,15 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2 ; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1] -; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3] -; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v5 -; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc -; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc -; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v1 +; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GFX9-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, 0, v[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc ; GFX9-NEXT: s_setpc_b64 s[30:31] ; @@ -1103,13 +1427,17 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX10-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX10-NEXT: v_sub_co_ci_u32_e32 v5, vcc_lo, v1, v3, vcc_lo -; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3] -; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX10-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 -; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo -; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX10-NEXT: 
v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v5 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: v_ssubsat_i64: @@ -1117,12 +1445,16 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) { ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v0, v2 ; GFX11-NEXT: v_sub_co_ci_u32_e64 v5, null, v1, v3, vcc_lo -; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3] -; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5 ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] -; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v6 -; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v5 +; GFX11-NEXT: v_xor_b32_e32 v2, 0x80000000, v1 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v1 :: v_dual_cndmask_b32 v1, v5, v2 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result diff --git a/llvm/test/CodeGen/AMDGPU/sub_i1.ll b/llvm/test/CodeGen/AMDGPU/sub_i1.ll index 19d012fc074f8..0aee3f27dec39 100644 --- a/llvm/test/CodeGen/AMDGPU/sub_i1.ll +++ b/llvm/test/CodeGen/AMDGPU/sub_i1.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE64 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,WAVE32 %s @@ -7,6 +8,20 @@ ; WAVE32: v_xor_b32_e32 ; WAVE64: v_xor_b32_e32 define amdgpu_kernel void @sub_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; WAVE64-LABEL: sub_var_var_i1: +; WAVE64: ; %bb.0: +; WAVE64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; WAVE64-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 +; WAVE64-NEXT: v_mov_b32_e32 v0, 0 +; WAVE64-NEXT: s_waitcnt lgkmcnt(0) +; WAVE64-NEXT: global_load_ubyte v1, v0, s[2:3] glc +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: global_load_ubyte v2, v0, s[6:7] glc +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: v_xor_b32_e32 v1, v1, v2 +; WAVE64-NEXT: v_and_b32_e32 v1, 1, v1 +; WAVE64-NEXT: global_store_byte v0, v1, s[0:1] +; WAVE64-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in0 %b = load volatile i1, ptr addrspace(1) %in1 %sub = sub i1 %a, %b @@ -18,6 +33,17 @@ define amdgpu_kernel void @sub_var_var_i1(ptr addrspace(1) %out, ptr addrspace(1 ; WAVE32: s_xor_b32 ; WAVE64: s_xor_b64 define amdgpu_kernel void @sub_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; WAVE64-LABEL: sub_var_imm_i1: +; WAVE64: ; %bb.0: +; WAVE64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; WAVE64-NEXT: v_mov_b32_e32 v0, 0 +; 
WAVE64-NEXT: s_waitcnt lgkmcnt(0) +; WAVE64-NEXT: global_load_ubyte v1, v0, s[2:3] glc +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: v_not_b32_e32 v1, v1 +; WAVE64-NEXT: v_and_b32_e32 v1, 1, v1 +; WAVE64-NEXT: global_store_byte v0, v1, s[0:1] +; WAVE64-NEXT: s_endpgm %a = load volatile i1, ptr addrspace(1) %in %sub = sub i1 %a, 1 store i1 %sub, ptr addrspace(1) %out @@ -29,6 +55,44 @@ define amdgpu_kernel void @sub_var_imm_i1(ptr addrspace(1) %out, ptr addrspace(1 ; WAVE32: s_xor_b32 ; WAVE64: s_xor_b64 define amdgpu_kernel void @sub_i1_cf(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; WAVE64-LABEL: sub_i1_cf: +; WAVE64: ; %bb.0: ; %entry +; WAVE64-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; WAVE64-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; WAVE64-NEXT: v_cmp_lt_u32_e32 vcc, 15, v0 +; WAVE64-NEXT: ; implicit-def: $sgpr4_sgpr5 +; WAVE64-NEXT: s_and_saveexec_b64 s[6:7], vcc +; WAVE64-NEXT: s_xor_b64 s[6:7], exec, s[6:7] +; WAVE64-NEXT: s_cbranch_execz .LBB2_2 +; WAVE64-NEXT: ; %bb.1: ; %else +; WAVE64-NEXT: v_mov_b32_e32 v0, 0 +; WAVE64-NEXT: s_waitcnt lgkmcnt(0) +; WAVE64-NEXT: global_load_ubyte v0, v0, s[8:9] glc +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: v_and_b32_e32 v0, 1, v0 +; WAVE64-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v0 +; WAVE64-NEXT: .LBB2_2: ; %Flow +; WAVE64-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7] +; WAVE64-NEXT: s_cbranch_execz .LBB2_4 +; WAVE64-NEXT: ; %bb.3: ; %if +; WAVE64-NEXT: v_mov_b32_e32 v0, 0 +; WAVE64-NEXT: s_waitcnt lgkmcnt(0) +; WAVE64-NEXT: global_load_ubyte v0, v0, s[2:3] glc +; WAVE64-NEXT: s_waitcnt vmcnt(0) +; WAVE64-NEXT: s_andn2_b64 s[2:3], s[4:5], exec +; WAVE64-NEXT: v_and_b32_e32 v0, 1, v0 +; WAVE64-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; WAVE64-NEXT: s_and_b64 s[4:5], vcc, exec +; WAVE64-NEXT: s_or_b64 s[4:5], s[2:3], s[4:5] +; WAVE64-NEXT: .LBB2_4: ; %endif +; WAVE64-NEXT: s_or_b64 exec, exec, s[6:7] +; WAVE64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; WAVE64-NEXT: v_not_b32_e32 v1, v1 +; WAVE64-NEXT: v_mov_b32_e32 v0, 0 +; WAVE64-NEXT: v_and_b32_e32 v1, 1, v1 +; WAVE64-NEXT: s_waitcnt lgkmcnt(0) +; WAVE64-NEXT: global_store_byte v0, v1, s[0:1] +; WAVE64-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() %d_cmp = icmp ult i32 %tid, 16 @@ -50,3 +114,6 @@ endif: } declare i32 @llvm.amdgcn.workitem.id.x() +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; GCN: {{.*}} +; WAVE32: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll index e5bc4c5721b90..8b4df164f4a8f 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-cmp-constant.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=SI -check-prefix=FUNC %s @@ -172,3 +173,6 @@ define amdgpu_kernel void @masked_load_i1_to_i32_trunc_cmp_ne_neg1(ptr addrspace store i1 %cmp, ptr addrspace(1) %out ret void } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; FUNC: {{.*}} +; SI: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll index dd3499ed4dd68..04903e79da375 100644 --- a/llvm/test/CodeGen/AMDGPU/trunc-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/trunc-combine.ll @@ -308,7 +308,8 @@ define <2 x i16> @vector_trunc_high_bits_undef_or_lhs_alignbit_regression(i32 %a ; VI-LABEL: vector_trunc_high_bits_undef_or_lhs_alignbit_regression: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_or_b32_e32 v0, 0xffff0011, v0 +; VI-NEXT: v_or_b32_e32 v0, 17, v0 +; VI-NEXT: v_or_b32_e32 v0, 0xffff0000, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 %lshr = or <2 x i32> %undef.hi.elt, splat (i32 17) @@ -368,8 +369,7 @@ define <2 x i16> @vector_trunc_high_bits_undef_mul_lhs_alignbit_regression(i32 % ; VI-LABEL: vector_trunc_high_bits_undef_mul_lhs_alignbit_regression: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_mul_lo_u32 v0, v0, 18 -; VI-NEXT: v_and_b32_e32 v0, 0xfffe, v0 +; VI-NEXT: v_mul_lo_u16_e32 v0, 18, v0 ; VI-NEXT: s_setpc_b64 s[30:31] %undef.hi.elt = insertelement <2 x i32> poison, i32 %arg0, i32 0 %lshr = mul <2 x i32> %undef.hi.elt, splat (i32 18) diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll index d23e314b9465f..1d355beaeb498 100644 --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -55,27 +55,29 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS1-NEXT: s_not_b32 s8, s54 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v1 +; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s8, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s4, 0 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s5, 1 +; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS1-NEXT: s_not_b32 s6, s6 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: s_bitcmp1_b32 s7, 0 +; GLOBALNESS1-NEXT: s_not_b32 s6, s7 +; GLOBALNESS1-NEXT: s_bitcmp1_b32 s6, 0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS1-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS1-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS1-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 0 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e64 s[70:71], 1, v3 ; GLOBALNESS1-NEXT: v_mov_b32_e32 v46, 0x80 ; 
GLOBALNESS1-NEXT: s_mov_b32 s82, s16 @@ -116,10 +118,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 ; GLOBALNESS1-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS1-NEXT: .LBB1_4: ; %bb5 @@ -143,35 +145,35 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[70:71] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS1-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[70:71] +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 +; GLOBALNESS1-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], -1 -; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], -1 +; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS1-NEXT: s_cmp_lt_i32 s55, 1 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], -1 ; GLOBALNESS1-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS1-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 1 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], 0 -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_7: ; %Flow26 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS1-NEXT: ; %bb.8: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_cmp_lg_u32 s55, 0 -; GLOBALNESS1-NEXT: s_mov_b64 s[8:9], 0 -; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS1-NEXT: s_mov_b64 s[10:11], 0 +; GLOBALNESS1-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS1-NEXT: .LBB1_9: ; %Flow25 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: s_mov_b64 s[6:7], -1 -; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS1-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS1-NEXT: ; %bb.10: ; %baz.exit.i @@ -186,8 +188,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s8, 10 -; GLOBALNESS1-NEXT: v_writelane_b32 v59, s9, 11 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s10, 10 +; GLOBALNESS1-NEXT: v_writelane_b32 v59, s11, 11 ; GLOBALNESS1-NEXT: v_readlane_b32 s4, v59, 2 ; GLOBALNESS1-NEXT: v_readlane_b32 s5, v59, 3 ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[4:5] @@ -278,12 +280,12 @@ 
define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_load_dwordx4 s[4:7], s[38:39], 0x0 ; GLOBALNESS1-NEXT: v_readlane_b32 s70, v59, 8 -; GLOBALNESS1-NEXT: v_readlane_b32 s8, v59, 10 +; GLOBALNESS1-NEXT: v_readlane_b32 s10, v59, 10 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; GLOBALNESS1-NEXT: v_readlane_b32 s71, v59, 9 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_mov_b32 s55, s7 -; GLOBALNESS1-NEXT: v_readlane_b32 s9, v59, 11 +; GLOBALNESS1-NEXT: v_readlane_b32 s11, v59, 11 ; GLOBALNESS1-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: s_or_b64 exec, exec, s[52:53] @@ -306,7 +308,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS1-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_29: ; %loop.exit.guard -; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[10:11] ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS1-NEXT: s_cbranch_vccz .LBB1_31 ; GLOBALNESS1-NEXT: ; %bb.30: ; %bb7.i.i @@ -369,27 +371,29 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] +; GLOBALNESS0-NEXT: s_not_b32 s8, s54 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v1 +; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s8, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s4, 0 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s5, 1 +; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS0-NEXT: s_not_b32 s6, s6 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[50:51], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: s_bitcmp1_b32 s7, 0 +; GLOBALNESS0-NEXT: s_not_b32 s6, s7 +; GLOBALNESS0-NEXT: s_bitcmp1_b32 s6, 0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[64:65], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 -; GLOBALNESS0-NEXT: s_xor_b64 s[4:5], s[4:5], -1 -; GLOBALNESS0-NEXT: s_mov_b64 s[38:39], s[8:9] -; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[8:9], 1, v1 -; GLOBALNESS0-NEXT: ; implicit-def: $vgpr59 : SGPR spill to VGPR lane ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[66:67], 1, v0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 0 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[68:69], 1, v0 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e64 s[84:85], 1, v3 ; GLOBALNESS0-NEXT: v_mov_b32_e32 v46, 0x80 ; GLOBALNESS0-NEXT: s_mov_b32 s70, s16 @@ -430,10 +434,10 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_2: ; %Flow15 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[4:5] -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 ; 
GLOBALNESS0-NEXT: .LBB1_3: ; %Flow28 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[56:57], v[0:1], v[0:1] op_sel:[0,1] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_29 ; GLOBALNESS0-NEXT: .LBB1_4: ; %bb5 @@ -457,35 +461,35 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[84:85] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 -; GLOBALNESS0-NEXT: ; implicit-def: $sgpr8_sgpr9 +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[84:85] +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 +; GLOBALNESS0-NEXT: ; implicit-def: $sgpr10_sgpr11 ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.5: ; %NodeBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], -1 -; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], -1 +; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 ; GLOBALNESS0-NEXT: s_cmp_lt_i32 s55, 1 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], -1 ; GLOBALNESS0-NEXT: s_cbranch_scc1 .LBB1_7 ; GLOBALNESS0-NEXT: ; %bb.6: ; %LeafBlock12 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 1 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], 0 -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_7: ; %Flow26 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[6:7] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[8:9] ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_9 ; GLOBALNESS0-NEXT: ; %bb.8: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_cmp_lg_u32 s55, 0 -; GLOBALNESS0-NEXT: s_mov_b64 s[8:9], 0 -; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 +; GLOBALNESS0-NEXT: s_mov_b64 s[10:11], 0 +; GLOBALNESS0-NEXT: s_cselect_b64 s[6:7], -1, 0 ; GLOBALNESS0-NEXT: .LBB1_9: ; %Flow25 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: s_mov_b64 s[6:7], -1 -; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[4:5] +; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[6:7] ; GLOBALNESS0-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_3 ; GLOBALNESS0-NEXT: ; %bb.10: ; %baz.exit.i @@ -500,8 +504,8 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb33.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: global_load_dwordx2 v[0:1], v[44:45], off -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s8, 10 -; GLOBALNESS0-NEXT: v_writelane_b32 v59, s9, 11 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s10, 10 +; GLOBALNESS0-NEXT: v_writelane_b32 v59, s11, 11 ; GLOBALNESS0-NEXT: v_readlane_b32 s4, v59, 2 ; GLOBALNESS0-NEXT: v_readlane_b32 s5, v59, 3 ; GLOBALNESS0-NEXT: s_mov_b32 s83, s55 @@ -592,11 +596,11 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: .LBB1_24: ; %Flow23 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_readlane_b32 s84, v59, 8 -; GLOBALNESS0-NEXT: v_readlane_b32 s8, v59, 10 +; GLOBALNESS0-NEXT: v_readlane_b32 s10, v59, 10 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 ; 
GLOBALNESS0-NEXT: s_mov_b32 s55, s83 ; GLOBALNESS0-NEXT: v_readlane_b32 s85, v59, 9 -; GLOBALNESS0-NEXT: v_readlane_b32 s9, v59, 11 +; GLOBALNESS0-NEXT: v_readlane_b32 s11, v59, 11 ; GLOBALNESS0-NEXT: .LBB1_25: ; %Flow24 ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: s_or_b64 exec, exec, s[52:53] @@ -619,7 +623,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %arg1.global, i1 %tmp3.i.i, i ; GLOBALNESS0-NEXT: global_store_dwordx2 v[44:45], v[42:43], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_29: ; %loop.exit.guard -; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[8:9] +; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[10:11] ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], -1 ; GLOBALNESS0-NEXT: s_cbranch_vccz .LBB1_31 ; GLOBALNESS0-NEXT: ; %bb.30: ; %bb7.i.i diff --git a/llvm/test/CodeGen/AMDGPU/uaddo.ll b/llvm/test/CodeGen/AMDGPU/uaddo.ll index 6606b1d050421..8a34e82b10b34 100644 --- a/llvm/test/CodeGen/AMDGPU/uaddo.ll +++ b/llvm/test/CodeGen/AMDGPU/uaddo.ll @@ -628,7 +628,10 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_not_b32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; SI-NEXT: .LBB8_2: ; %exit ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -655,7 +658,10 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_not_b32_e32 v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; VI-NEXT: .LBB8_2: ; %exit ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] @@ -678,7 +684,10 @@ define amdgpu_kernel void @s_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_not_b32_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; GFX9-NEXT: .LBB8_2: ; %exit ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -726,7 +735,10 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_not_b32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v1 ; SI-NEXT: .LBB9_2: ; %exit ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 @@ -756,7 +768,10 @@ define amdgpu_kernel void @v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: v_add_u32_e64 v0, s[0:1], v1, v2 ; VI-NEXT: s_cbranch_vccnz .LBB9_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; VI-NEXT: v_not_b32_e32 v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v1 ; VI-NEXT: .LBB9_2: ; %exit ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 @@ -780,7 +795,10 @@ define amdgpu_kernel void 
@v_uaddo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: v_add_co_u32_e64 v1, s[0:1], v2, v3 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_not_b32_e32 v2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v2 ; GFX9-NEXT: .LBB9_2: ; %exit ; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll index 5acbb044c1057..de7a02eeffaa8 100644 --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -123,73 +123,83 @@ define amdgpu_kernel void @s_test_udiv_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[6:7] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s16, s[2:3] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] -; GCN-IR-NEXT: s_sub_u32 s12, s10, s16 -; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[12:13], 63 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec -; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[18:19] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] +; GCN-IR-NEXT: s_sub_u32 s8, s10, s16 +; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[12:13], s[8:9], 63 +; GCN-IR-NEXT: v_or_b32_e32 v0, v1, v0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[12:13] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 63 +; GCN-IR-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[12:13] +; GCN-IR-NEXT: v_or_b32_e32 v1, v0, v1 +; GCN-IR-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s14, s12, 1 -; GCN-IR-NEXT: s_addc_u32 s15, s13, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[14:15], 0 -; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s12 +; GCN-IR-NEXT: s_add_u32 s12, s8, 1 +; GCN-IR-NEXT: s_addc_u32 s13, s9, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[12:13], 0 +; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[2:3], s8 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 -; GCN-IR-NEXT: s_add_u32 s14, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s15, s7, -1 +; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s12 +; GCN-IR-NEXT: s_add_u32 s14, s4, -1 +; GCN-IR-NEXT: 
s_addc_u32 s15, s5, -1 ; GCN-IR-NEXT: s_not_b64 s[2:3], s[10:11] ; GCN-IR-NEXT: s_add_u32 s2, s2, s16 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 +; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s4, s14, s12 -; GCN-IR-NEXT: s_subb_u32 s4, s15, s13 -; GCN-IR-NEXT: s_ashr_i32 s10, s4, 31 +; GCN-IR-NEXT: s_sub_u32 s6, s14, s12 +; GCN-IR-NEXT: s_subb_u32 s6, s15, s13 +; GCN-IR-NEXT: s_ashr_i32 s10, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s11, s10 -; GCN-IR-NEXT: s_and_b32 s4, s10, 1 -; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[6:7] +; GCN-IR-NEXT: s_and_b32 s6, s10, 1 +; GCN-IR-NEXT: s_and_b64 s[10:11], s[10:11], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s11 ; GCN-IR-NEXT: s_add_u32 s2, s2, 1 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[2:3], 0 -; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[16:17] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[2:3] -; GCN-IR-NEXT: .LBB0_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 +; GCN-IR-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: s_branch .LBB0_6 +; GCN-IR-NEXT: .LBB0_5: +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-IR-NEXT: .LBB0_6: ; %udiv-end ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, %y @@ -310,27 +320,35 @@ define i64 @v_test_udiv_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_udiv_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 -; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 -; GCN-IR-NEXT: v_min_u32_e32 v14, v4, v5 -; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 -; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 -; GCN-IR-NEXT: v_min_u32_e32 v15, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v8, s[6:7], v14, v15 +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v3 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v9, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[8:9] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v2 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; 
GCN-IR-NEXT: v_min_u32_e32 v14, v5, v6 +; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v0 +; GCN-IR-NEXT: v_add_i32_e32 v5, vcc, 32, v5 +; GCN-IR-NEXT: v_ffbh_u32_e32 v6, v1 +; GCN-IR-NEXT: v_min_u32_e32 v15, v5, v6 +; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, v14, v15 +; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[8:9] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v4, v5 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[8:9] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v10, vcc, 1, v8 @@ -893,31 +911,32 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-LABEL: s_test_udiv_k_num_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] -; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5 -; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], exec -; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] -; GCN-IR-NEXT: s_mov_b32 s7, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 +; GCN-IR-NEXT: s_add_u32 s6, s12, 0xffffffc5 +; GCN-IR-NEXT: s_addc_u32 s7, 0, -1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[4:5], s[6:7], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[6:7], 63 +; GCN-IR-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GCN-IR-NEXT: v_or_b32_e32 v1, v0, v1 +; GCN-IR-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s8 +; GCN-IR-NEXT: s_add_u32 s8, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s9, s7, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 0 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], 24, s6 ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s10 +; GCN-IR-NEXT: s_lshr_b64 s[10:11], 24, s8 ; GCN-IR-NEXT: s_add_u32 s14, s2, -1 ; GCN-IR-NEXT: 
s_addc_u32 s15, s3, -1 ; GCN-IR-NEXT: s_sub_u32 s8, 58, s12 @@ -947,12 +966,18 @@ define amdgpu_kernel void @s_test_udiv_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB8_3 ; GCN-IR-NEXT: .LBB8_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] -; GCN-IR-NEXT: .LBB8_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: s_branch .LBB8_6 +; GCN-IR-NEXT: .LBB8_5: +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 24, 0, vcc +; GCN-IR-NEXT: .LBB8_6: ; %udiv-end ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 24, %x @@ -1068,17 +1093,24 @@ define i64 @v_test_udiv_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffd0, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v7, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v3, 0x8000 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[4:5] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v5, 0x8000 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v2, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v3, v4, v3 +; GCN-IR-NEXT: v_xor_b32_e32 v4, 1, v3 +; GCN-IR-NEXT: v_and_b32_e32 v3, 1, v3 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v5, 0, vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v4, v4, v5 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6 @@ -1154,20 +1186,27 @@ define i64 @v_test_udiv_pow2_k_den_i64(i64 %x) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 48, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 48, v10 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v4, 1, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 
1, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v4, v4, v5 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB10_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6 @@ -1262,31 +1301,32 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] -; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 -; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[2:3] ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec -; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 -; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] +; GCN-IR-NEXT: s_sub_u32 s6, 59, s10 +; GCN-IR-NEXT: s_subb_u32 s7, 0, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[4:5], s[6:7], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[6:7], 63 +; GCN-IR-NEXT: v_or_b32_e32 v0, v0, v1 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[4:5] +; GCN-IR-NEXT: v_or_b32_e32 v1, v0, v1 +; GCN-IR-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 -; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[10:11], 0 -; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[6:7] -; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s8 +; GCN-IR-NEXT: s_add_u32 s8, s6, 1 +; GCN-IR-NEXT: s_addc_u32 s9, s7, 0 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[8:9], 0 +; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 +; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[12:13] +; GCN-IR-NEXT: s_lshl_b64 s[6:7], s[2:3], s6 ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_4 ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader -; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s10 -; GCN-IR-NEXT: s_add_u32 s2, s12, 0xffffffc4 +; GCN-IR-NEXT: s_lshr_b64 s[8:9], s[2:3], s8 +; GCN-IR-NEXT: s_add_u32 s2, s10, 0xffffffc4 ; GCN-IR-NEXT: s_addc_u32 s3, 0, -1 ; GCN-IR-NEXT: s_mov_b64 s[10:11], 0 ; GCN-IR-NEXT: s_mov_b32 s5, 0 @@ -1312,12 +1352,20 @@ define amdgpu_kernel void @s_test_udiv_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_cbranch_vccz .LBB11_3 ; GCN-IR-NEXT: .LBB11_4: ; %Flow6 ; GCN-IR-NEXT: s_lshl_b64 s[2:3], s[6:7], 1 -; GCN-IR-NEXT: s_or_b64 s[6:7], s[4:5], s[2:3] -; GCN-IR-NEXT: .LBB11_5: ; %udiv-end -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 +; GCN-IR-NEXT: s_or_b64 s[2:3], s[4:5], s[2:3] +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_mov_b32_e32 v1, 
s3 +; GCN-IR-NEXT: s_branch .LBB11_6 +; GCN-IR-NEXT: .LBB11_5: +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc +; GCN-IR-NEXT: .LBB11_6: ; %udiv-end ; GCN-IR-NEXT: s_mov_b32 s3, 0xf000 ; GCN-IR-NEXT: s_mov_b32 s2, -1 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GCN-IR-NEXT: s_endpgm %result = udiv i64 %x, 24 @@ -1354,20 +1402,27 @@ define i64 @v_test_udiv_k_den_i64(i64 %x) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v6, s[4:5], 59, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 59, v10 +; GCN-IR-NEXT: v_subb_u32_e64 v7, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[6:7] +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[6:7] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v2, v3, v2 +; GCN-IR-NEXT: v_xor_b32_e32 v4, 1, v2 +; GCN-IR-NEXT: v_and_b32_e32 v2, 1, v2 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[6:7] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v2, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v3, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v4, v4, v5 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB12_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v6 diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index eb1b844ad8938..e16787b29fd7b 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -378,114 +378,132 @@ entry: define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: s_uint_to_fp_i1_to_f16: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s3, 0xf000 -; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: s_mov_b32 s6, s2 -; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s10, s6 +; SI-NEXT: s_mov_b32 s11, s7 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b32 s12, s10 -; SI-NEXT: s_mov_b32 s13, s11 -; SI-NEXT: s_mov_b32 s14, s2 -; SI-NEXT: s_mov_b32 s15, s3 -; SI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; SI-NEXT: s_mov_b32 s12, s2 +; SI-NEXT: s_mov_b32 s13, s3 +; SI-NEXT: s_mov_b32 s14, s6 +; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: 
s_mov_b32 s15, s7 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; SI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc -; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v0, v1, v0 +; SI-NEXT: v_and_b32_e32 v0, 1, v0 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: s_mov_b32 s0, s8 -; SI-NEXT: s_mov_b32 s1, s9 -; SI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm ; ; VI-LABEL: s_uint_to_fp_i1_to_f16: ; VI: ; %bb.0: -; VI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 -; VI-NEXT: s_mov_b32 s3, 0xf000 -; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: s_mov_b32 s6, s2 -; VI-NEXT: s_mov_b32 s7, s3 +; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; VI-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x34 +; VI-NEXT: s_mov_b32 s7, 0xf000 +; VI-NEXT: s_mov_b32 s6, -1 +; VI-NEXT: s_mov_b32 s10, s6 +; VI-NEXT: s_mov_b32 s11, s7 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_mov_b32 s12, s10 -; VI-NEXT: s_mov_b32 s13, s11 -; VI-NEXT: s_mov_b32 s14, s2 -; VI-NEXT: s_mov_b32 s15, s3 -; VI-NEXT: buffer_load_dword v0, off, s[4:7], 0 +; VI-NEXT: s_mov_b32 s12, s2 +; VI-NEXT: s_mov_b32 s13, s3 +; VI-NEXT: s_mov_b32 s14, s6 +; VI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; VI-NEXT: s_mov_b32 s15, s7 ; VI-NEXT: buffer_load_dword v1, off, s[12:15], 0 +; VI-NEXT: s_mov_b32 s4, s0 +; VI-NEXT: s_mov_b32 s5, s1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; VI-NEXT: s_xor_b64 s[0:1], s[0:1], vcc -; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_xor_b32_e32 v0, v1, v0 +; VI-NEXT: v_and_b32_e32 v0, 1, v0 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; VI-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; VI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; VI-NEXT: s_mov_b32 s0, s8 -; VI-NEXT: s_mov_b32 s1, s9 -; VI-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; ; GFX11-TRUE16-LABEL: s_uint_to_fp_i1_to_f16: ; GFX11-TRUE16: ; %bb.0: ; GFX11-TRUE16-NEXT: s_clause 0x1 -; GFX11-TRUE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 -; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 -; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-TRUE16-NEXT: s_mov_b32 s2, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s3, s7 -; GFX11-TRUE16-NEXT: s_mov_b32 s14, s6 -; GFX11-TRUE16-NEXT: s_mov_b32 s15, s7 +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 ; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-TRUE16-NEXT: s_mov_b32 s12, s10 -; GFX11-TRUE16-NEXT: s_mov_b32 s13, s11 -; GFX11-TRUE16-NEXT: buffer_load_b32 v0, 
off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s3 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-TRUE16-NEXT: s_mov_b32 s4, s8 -; GFX11-TRUE16-NEXT: s_mov_b32 s5, s9 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s0 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s1 ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-TRUE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v1 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 -; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-TRUE16-NEXT: s_endpgm ; ; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16: ; GFX11-FAKE16: ; %bb.0: ; GFX11-FAKE16-NEXT: s_clause 0x1 -; GFX11-FAKE16-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 -; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 -; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-FAKE16-NEXT: s_mov_b32 s2, s6 -; GFX11-FAKE16-NEXT: s_mov_b32 s3, s7 -; GFX11-FAKE16-NEXT: s_mov_b32 s14, s6 -; GFX11-FAKE16-NEXT: s_mov_b32 s15, s7 +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 ; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-FAKE16-NEXT: s_mov_b32 s12, s10 -; GFX11-FAKE16-NEXT: s_mov_b32 s13, s11 -; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s3 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[4:7], 0 ; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-FAKE16-NEXT: s_mov_b32 s4, s8 -; GFX11-FAKE16-NEXT: s_mov_b32 s5, s9 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s0 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s1 ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) ; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) -; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-FAKE16-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 0, v1 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, v1, v0 +; 
GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 ; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 diff --git a/llvm/test/CodeGen/AMDGPU/uniform-select.ll b/llvm/test/CodeGen/AMDGPU/uniform-select.ll index f001bf0d5e498..688e74bc0fc49 100644 --- a/llvm/test/CodeGen/AMDGPU/uniform-select.ll +++ b/llvm/test/CodeGen/AMDGPU/uniform-select.ll @@ -7,47 +7,51 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX90A-LABEL: test_insert_extract: ; GFX90A: ; %bb.0: ; %entry -; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 -; GFX90A-NEXT: s_mov_b32 s2, 0 -; GFX90A-NEXT: s_and_b64 vcc, exec, -1 -; GFX90A-NEXT: s_mov_b32 s3, 0 +; GFX90A-NEXT: s_load_dwordx2 s[2:3], s[8:9], 0x0 ; GFX90A-NEXT: s_mov_b32 s4, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 +; GFX90A-NEXT: s_and_b64 vcc, exec, -1 ; GFX90A-NEXT: s_mov_b32 s5, 0 ; GFX90A-NEXT: s_mov_b32 s6, 0 +; GFX90A-NEXT: s_mov_b32 s7, 0 ; GFX90A-NEXT: .LBB0_1: ; %for.body ; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 -; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s4, s3 -; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 -; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s5, s7 -; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 -; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s7, s6, s7 -; GFX90A-NEXT: s_or_b32 s7, s7, s0 -; GFX90A-NEXT: s_cmp_eq_u32 s1, 1 -; GFX90A-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX90A-NEXT: s_and_b64 s[10:11], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s4, s7, s4 -; GFX90A-NEXT: s_cmp_eq_u32 s1, 3 -; GFX90A-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX90A-NEXT: s_and_b64 s[12:13], s[10:11], exec -; GFX90A-NEXT: s_cselect_b32 s6, s7, s6 -; GFX90A-NEXT: s_cmp_eq_u32 s1, 2 -; GFX90A-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX90A-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX90A-NEXT: s_cselect_b32 s5, s7, s5 -; GFX90A-NEXT: s_cmp_eq_u32 s1, 0 -; GFX90A-NEXT: s_cselect_b32 s3, s7, s3 -; GFX90A-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] -; GFX90A-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GFX90A-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX90A-NEXT: s_cselect_b32 s2, 0, s2 +; GFX90A-NEXT: s_cmp_eq_u32 s3, 1 +; GFX90A-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX90A-NEXT: s_cselect_b32 s8, s5, s4 +; GFX90A-NEXT: s_cmp_eq_u32 s3, 2 +; GFX90A-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX90A-NEXT: s_cselect_b32 s8, s6, s8 +; GFX90A-NEXT: s_cmp_eq_u32 s3, 3 +; GFX90A-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX90A-NEXT: s_cselect_b32 s0, s7, s8 +; GFX90A-NEXT: s_or_b32 s8, s0, s2 +; GFX90A-NEXT: s_cmp_eq_u32 s3, 1 +; GFX90A-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX90A-NEXT: s_cselect_b32 s5, s8, s5 +; GFX90A-NEXT: s_cmp_eq_u32 s3, 3 +; 
GFX90A-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX90A-NEXT: s_cselect_b32 s7, s8, s7 +; GFX90A-NEXT: s_cmp_eq_u32 s3, 2 +; GFX90A-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX90A-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; GFX90A-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX90A-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX90A-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX90A-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX90A-NEXT: s_cselect_b32 s6, s8, s6 +; GFX90A-NEXT: s_cmp_eq_u32 s3, 0 +; GFX90A-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; GFX90A-NEXT: s_cselect_b32 s4, s8, s4 +; GFX90A-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] ; GFX90A-NEXT: s_mov_b64 vcc, vcc ; GFX90A-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX90A-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -55,47 +59,52 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; ; GFX942-LABEL: test_insert_extract: ; GFX942: ; %bb.0: ; %entry -; GFX942-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX942-NEXT: s_mov_b32 s2, 0 -; GFX942-NEXT: s_and_b64 vcc, exec, -1 -; GFX942-NEXT: s_mov_b32 s3, 0 +; GFX942-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX942-NEXT: s_mov_b32 s4, 0 +; GFX942-NEXT: v_mov_b32_e32 v0, 0 +; GFX942-NEXT: s_and_b64 vcc, exec, -1 ; GFX942-NEXT: s_mov_b32 s5, 0 ; GFX942-NEXT: s_mov_b32 s6, 0 +; GFX942-NEXT: s_mov_b32 s7, 0 ; GFX942-NEXT: .LBB0_1: ; %for.body ; GFX942-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX942-NEXT: s_waitcnt lgkmcnt(0) -; GFX942-NEXT: s_cmp_eq_u32 s1, 1 -; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s4, s3 -; GFX942-NEXT: s_cmp_eq_u32 s1, 2 -; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s5, s7 -; GFX942-NEXT: s_cmp_eq_u32 s1, 3 -; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s7, s6, s7 -; GFX942-NEXT: s_or_b32 s7, s7, s0 -; GFX942-NEXT: s_cmp_eq_u32 s1, 1 -; GFX942-NEXT: s_cselect_b64 s[8:9], -1, 0 -; GFX942-NEXT: s_and_b64 s[10:11], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s4, s7, s4 -; GFX942-NEXT: s_cmp_eq_u32 s1, 3 -; GFX942-NEXT: s_cselect_b64 s[10:11], -1, 0 -; GFX942-NEXT: s_and_b64 s[12:13], s[10:11], exec -; GFX942-NEXT: s_cselect_b32 s6, s7, s6 -; GFX942-NEXT: s_cmp_eq_u32 s1, 2 -; GFX942-NEXT: s_cselect_b64 s[12:13], -1, 0 -; GFX942-NEXT: s_and_b64 s[14:15], s[12:13], exec -; GFX942-NEXT: s_cselect_b32 s5, s7, s5 -; GFX942-NEXT: s_cmp_eq_u32 s1, 0 -; GFX942-NEXT: s_cselect_b32 s3, s7, s3 -; GFX942-NEXT: s_or_b64 s[8:9], s[12:13], s[8:9] -; GFX942-NEXT: s_or_b64 s[8:9], s[10:11], s[8:9] -; GFX942-NEXT: s_and_b64 s[8:9], s[8:9], exec -; GFX942-NEXT: s_cselect_b32 s2, 0, s2 +; GFX942-NEXT: s_cmp_eq_u32 s3, 1 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: s_cselect_b32 s8, s5, s4 +; GFX942-NEXT: s_cmp_eq_u32 s3, 2 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: s_cselect_b32 s8, s6, s8 +; GFX942-NEXT: s_cmp_eq_u32 s3, 3 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: s_cselect_b32 s0, s7, s8 +; GFX942-NEXT: s_or_b32 s8, s0, s2 +; GFX942-NEXT: s_cmp_eq_u32 s3, 1 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: s_cselect_b32 s5, 
s8, s5 +; GFX942-NEXT: s_cmp_eq_u32 s3, 3 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: s_cselect_b32 s7, s8, s7 +; GFX942-NEXT: s_cmp_eq_u32 s3, 2 +; GFX942-NEXT: s_cselect_b64 s[0:1], -1, 0 +; GFX942-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[0:1] +; GFX942-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX942-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX942-NEXT: s_and_b64 s[0:1], s[0:1], exec +; GFX942-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX942-NEXT: s_cselect_b32 s6, s8, s6 +; GFX942-NEXT: s_cmp_eq_u32 s3, 0 +; GFX942-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 +; GFX942-NEXT: s_cselect_b32 s4, s8, s4 +; GFX942-NEXT: s_nop 0 +; GFX942-NEXT: v_cndmask_b32_e64 v0, v0, 0, s[0:1] ; GFX942-NEXT: s_mov_b64 vcc, vcc ; GFX942-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX942-NEXT: ; %bb.2: ; %DummyReturnBlock @@ -104,102 +113,115 @@ define amdgpu_kernel void @test_insert_extract(i32 %p, i32 %q) { ; GFX1030-LABEL: test_insert_extract: ; GFX1030: ; %bb.0: ; %entry ; GFX1030-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GFX1030-NEXT: v_mov_b32_e32 v0, 0 ; GFX1030-NEXT: s_mov_b32 s2, 0 ; GFX1030-NEXT: s_mov_b32 s3, 0 ; GFX1030-NEXT: s_mov_b32 s4, 0 ; GFX1030-NEXT: s_mov_b32 s5, 0 -; GFX1030-NEXT: s_mov_b32 s6, 0 -; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX1030-NEXT: s_inst_prefetch 0x1 ; GFX1030-NEXT: .p2align 6 ; GFX1030-NEXT: .LBB0_1: ; %for.body ; GFX1030-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1030-NEXT: s_waitcnt lgkmcnt(0) ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 +; GFX1030-NEXT: s_cselect_b32 s6, -1, 0 +; GFX1030-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1030-NEXT: s_cselect_b32 s6, s3, s2 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 ; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s4, s3 -; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 -; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1030-NEXT: s_cselect_b32 s6, s4, s6 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 -; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1030-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s7, s6, s7 -; GFX1030-NEXT: s_or_b32 s7, s7, s0 +; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1030-NEXT: s_cselect_b32 s6, s5, s6 +; GFX1030-NEXT: s_or_b32 s6, s6, s0 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 1 -; GFX1030-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1030-NEXT: s_and_b32 s9, s8, exec_lo -; GFX1030-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1030-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 +; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1030-NEXT: s_cselect_b32 s3, s6, s3 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 3 -; GFX1030-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1030-NEXT: s_and_b32 s10, s9, exec_lo -; GFX1030-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1030-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 +; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1030-NEXT: s_cselect_b32 s5, s6, s5 ; GFX1030-NEXT: s_cmp_eq_u32 s1, 2 -; GFX1030-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1030-NEXT: s_and_b32 s11, s10, exec_lo -; GFX1030-NEXT: s_cselect_b32 s5, s7, s5 -; GFX1030-NEXT: s_cmp_eq_u32 s1, 0 -; GFX1030-NEXT: s_cselect_b32 s3, s7, s3 -; GFX1030-NEXT: s_or_b32 s7, s10, s8 -; GFX1030-NEXT: s_or_b32 s7, s9, s7 +; GFX1030-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1030-NEXT: v_cndmask_b32_e64 v3, 0, 1, s7 ; GFX1030-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1030-NEXT: s_cselect_b32 
s2, 0, s2 +; GFX1030-NEXT: s_cselect_b32 s4, s6, s4 +; GFX1030-NEXT: s_cmp_eq_u32 s1, 0 +; GFX1030-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX1030-NEXT: s_cselect_b32 s2, s6, s2 +; GFX1030-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX1030-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1030-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1030-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX1030-NEXT: s_mov_b32 vcc_lo, exec_lo ; GFX1030-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1030-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX1030-NEXT: s_inst_prefetch 0x2 ; GFX1030-NEXT: s_endpgm ; ; GFX1100-LABEL: test_insert_extract: ; GFX1100: ; %bb.0: ; %entry ; GFX1100-NEXT: s_load_b64 s[0:1], s[4:5], 0x0 +; GFX1100-NEXT: v_mov_b32_e32 v0, 0 ; GFX1100-NEXT: s_mov_b32 s2, 0 ; GFX1100-NEXT: s_mov_b32 s3, 0 ; GFX1100-NEXT: s_mov_b32 s4, 0 ; GFX1100-NEXT: s_mov_b32 s5, 0 -; GFX1100-NEXT: s_mov_b32 s6, 0 -; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo +; GFX1100-NEXT: s_set_inst_prefetch_distance 0x1 ; GFX1100-NEXT: .p2align 6 ; GFX1100-NEXT: .LBB0_1: ; %for.body ; GFX1100-NEXT: ; =>This Inner Loop Header: Depth=1 ; GFX1100-NEXT: s_waitcnt lgkmcnt(0) ; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 -; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1100-NEXT: s_cselect_b32 s6, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s4, s3 +; GFX1100-NEXT: s_and_b32 s6, s6, exec_lo +; GFX1100-NEXT: s_cselect_b32 s6, s3, s2 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 -; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s5, s7 +; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-NEXT: s_cselect_b32 s6, s4, s6 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 -; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 +; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 ; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX1100-NEXT: s_and_b32 s8, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s7, s6, s7 -; GFX1100-NEXT: s_or_b32 s7, s7, s0 +; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-NEXT: s_cselect_b32 s6, s5, s6 +; GFX1100-NEXT: s_or_b32 s6, s6, s0 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 1 -; GFX1100-NEXT: s_cselect_b32 s8, -1, 0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GFX1100-NEXT: s_and_b32 s9, s8, exec_lo -; GFX1100-NEXT: s_cselect_b32 s4, s7, s4 +; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v1, 0, 1, s7 +; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-NEXT: s_cselect_b32 s3, s6, s3 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 3 -; GFX1100-NEXT: s_cselect_b32 s9, -1, 0 -; GFX1100-NEXT: s_and_b32 s10, s9, exec_lo -; GFX1100-NEXT: s_cselect_b32 s6, s7, s6 +; GFX1100-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1100-NEXT: v_cndmask_b32_e64 v2, 0, 1, s7 +; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo +; GFX1100-NEXT: s_cselect_b32 s5, s6, s5 ; GFX1100-NEXT: s_cmp_eq_u32 s1, 2 -; GFX1100-NEXT: s_cselect_b32 s10, -1, 0 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GFX1100-NEXT: s_and_b32 s11, s10, exec_lo -; GFX1100-NEXT: s_cselect_b32 s5, s7, s5 -; GFX1100-NEXT: s_cmp_eq_u32 s1, 0 -; GFX1100-NEXT: s_cselect_b32 s3, s7, s3 -; GFX1100-NEXT: s_or_b32 s7, s10, s8 -; GFX1100-NEXT: s_or_b32 s7, s9, s7 -; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; 
GFX1100-NEXT: s_cselect_b32 s7, -1, 0 +; GFX1100-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_cndmask_b32_e64 v3, 0, 1, s7 ; GFX1100-NEXT: s_and_b32 s7, s7, exec_lo -; GFX1100-NEXT: s_cselect_b32 s2, 0, s2 +; GFX1100-NEXT: s_cselect_b32 s4, s6, s4 +; GFX1100-NEXT: s_cmp_eq_u32 s1, 0 +; GFX1100-NEXT: v_or_b32_e32 v1, v3, v1 +; GFX1100-NEXT: s_cselect_b32 s2, s6, s2 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX1100-NEXT: v_or_b32_e32 v1, v2, v1 +; GFX1100-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX1100-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1100-NEXT: v_cndmask_b32_e64 v0, v0, 0, vcc_lo +; GFX1100-NEXT: s_mov_b32 vcc_lo, exec_lo ; GFX1100-NEXT: s_cbranch_vccnz .LBB0_1 ; GFX1100-NEXT: ; %bb.2: ; %DummyReturnBlock +; GFX1100-NEXT: s_set_inst_prefetch_distance 0x2 ; GFX1100-NEXT: s_endpgm entry: %init = insertelement <4 x i32> zeroinitializer, i32 0, i64 0 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll index 94f1b83ea2765..b3ee489e691be 100644 --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -122,26 +122,34 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; ; GCN-IR-LABEL: s_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases -; GCN-IR-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0xd ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 -; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 -; GCN-IR-NEXT: s_mov_b32 s11, 0 +; GCN-IR-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; GCN-IR-NEXT: s_mov_b64 s[6:7], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[6:7], 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[12:13], s[2:3], 0 -; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[6:7] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[2:3], 0 +; GCN-IR-NEXT: s_flbit_i32_b64 s10, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[8:9], s[4:5], 0 ; GCN-IR-NEXT: s_flbit_i32_b64 s18, s[2:3] -; GCN-IR-NEXT: s_or_b64 s[8:9], s[8:9], s[12:13] +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[8:9] +; GCN-IR-NEXT: v_readfirstlane_b32 s8, v1 +; GCN-IR-NEXT: v_readfirstlane_b32 s9, v0 +; GCN-IR-NEXT: s_or_b32 s11, s8, s9 ; GCN-IR-NEXT: s_sub_u32 s12, s10, s18 ; GCN-IR-NEXT: s_subb_u32 s13, 0, 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[14:15], s[12:13], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[16:17], s[12:13], 63 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[8:9], s[14:15] -; GCN-IR-NEXT: s_and_b64 s[8:9], s[14:15], exec +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[8:9], s[12:13], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[12:13], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[8:9] +; GCN-IR-NEXT: v_readfirstlane_b32 s8, v0 +; GCN-IR-NEXT: s_or_b32 s11, s11, s8 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[14:15] +; GCN-IR-NEXT: v_or_b32_e32 v0, s11, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: s_bitcmp1_b32 s11, 0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-IR-NEXT: s_cselect_b32 s9, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s8, 0, s2 -; GCN-IR-NEXT: s_or_b64 s[14:15], s[14:15], s[16:17] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[14:15] +; GCN-IR-NEXT: s_mov_b32 s11, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s12, 1 @@ -153,47 +161,47 @@ define amdgpu_kernel void @s_test_urem_i64(ptr addrspace(1) %out, i64 %x, i64 %y ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_4 ; GCN-IR-NEXT: ; %bb.2: ; 
%udiv-preheader ; GCN-IR-NEXT: s_lshr_b64 s[12:13], s[2:3], s14 -; GCN-IR-NEXT: s_add_u32 s16, s6, -1 -; GCN-IR-NEXT: s_addc_u32 s17, s7, -1 -; GCN-IR-NEXT: s_not_b64 s[4:5], s[10:11] -; GCN-IR-NEXT: s_add_u32 s10, s4, s18 -; GCN-IR-NEXT: s_addc_u32 s11, s5, 0 +; GCN-IR-NEXT: s_add_u32 s16, s4, -1 +; GCN-IR-NEXT: s_addc_u32 s17, s5, -1 +; GCN-IR-NEXT: s_not_b64 s[6:7], s[10:11] +; GCN-IR-NEXT: s_add_u32 s10, s6, s18 +; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 ; GCN-IR-NEXT: s_mov_b64 s[14:15], 0 -; GCN-IR-NEXT: s_mov_b32 s5, 0 +; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: .LBB0_3: ; %udiv-do-while ; GCN-IR-NEXT: ; =>This Inner Loop Header: Depth=1 ; GCN-IR-NEXT: s_lshl_b64 s[12:13], s[12:13], 1 -; GCN-IR-NEXT: s_lshr_b32 s4, s9, 31 +; GCN-IR-NEXT: s_lshr_b32 s6, s9, 31 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[4:5] +; GCN-IR-NEXT: s_or_b64 s[12:13], s[12:13], s[6:7] ; GCN-IR-NEXT: s_or_b64 s[8:9], s[14:15], s[8:9] -; GCN-IR-NEXT: s_sub_u32 s4, s16, s12 -; GCN-IR-NEXT: s_subb_u32 s4, s17, s13 -; GCN-IR-NEXT: s_ashr_i32 s14, s4, 31 +; GCN-IR-NEXT: s_sub_u32 s6, s16, s12 +; GCN-IR-NEXT: s_subb_u32 s6, s17, s13 +; GCN-IR-NEXT: s_ashr_i32 s14, s6, 31 ; GCN-IR-NEXT: s_mov_b32 s15, s14 -; GCN-IR-NEXT: s_and_b32 s4, s14, 1 -; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[6:7] +; GCN-IR-NEXT: s_and_b32 s6, s14, 1 +; GCN-IR-NEXT: s_and_b64 s[14:15], s[14:15], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s14 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s15 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[18:19], s[10:11], 0 -; GCN-IR-NEXT: s_mov_b64 s[14:15], s[4:5] +; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] ; GCN-IR-NEXT: s_and_b64 vcc, exec, s[18:19] ; GCN-IR-NEXT: s_cbranch_vccz .LBB0_3 ; GCN-IR-NEXT: .LBB0_4: ; %Flow7 ; GCN-IR-NEXT: s_lshl_b64 s[8:9], s[8:9], 1 -; GCN-IR-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9] +; GCN-IR-NEXT: s_or_b64 s[8:9], s[6:7], s[8:9] ; GCN-IR-NEXT: .LBB0_5: ; %udiv-end ; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 -; GCN-IR-NEXT: v_mul_hi_u32 v0, s6, v0 +; GCN-IR-NEXT: v_mul_hi_u32 v0, s4, v0 ; GCN-IR-NEXT: s_mov_b32 s12, s0 -; GCN-IR-NEXT: s_mul_i32 s0, s6, s9 +; GCN-IR-NEXT: s_mul_i32 s0, s4, s9 ; GCN-IR-NEXT: v_mov_b32_e32 v2, s3 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s7, s8 +; GCN-IR-NEXT: s_mul_i32 s0, s5, s8 ; GCN-IR-NEXT: v_add_i32_e32 v1, vcc, s0, v0 -; GCN-IR-NEXT: s_mul_i32 s0, s6, s8 +; GCN-IR-NEXT: s_mul_i32 s0, s4, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v0, s0 ; GCN-IR-NEXT: v_sub_i32_e32 v0, vcc, s2, v0 ; GCN-IR-NEXT: s_mov_b32 s15, 0xf000 @@ -319,27 +327,35 @@ define i64 @v_test_urem_i64(i64 %x, i64 %y) { ; GCN-IR-LABEL: v_test_urem_i64: ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v5, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v2 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v3 ; GCN-IR-NEXT: v_min_u32_e32 v12, v4, v5 ; GCN-IR-NEXT: v_ffbh_u32_e32 v4, v0 -; GCN-IR-NEXT: v_add_i32_e64 v4, s[6:7], 32, v4 +; GCN-IR-NEXT: v_add_i32_e32 v4, vcc, 32, v4 ; GCN-IR-NEXT: v_ffbh_u32_e32 v5, v1 ; GCN-IR-NEXT: v_min_u32_e32 v13, v4, v5 -; GCN-IR-NEXT: v_sub_i32_e64 v4, s[6:7], v12, v13 -; GCN-IR-NEXT: 
v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] -; GCN-IR-NEXT: v_subb_u32_e64 v5, s[6:7], 0, 0, s[6:7] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[6:7], 63, v[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] +; GCN-IR-NEXT: v_sub_i32_e32 v4, vcc, v12, v13 +; GCN-IR-NEXT: v_subb_u32_e64 v5, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_xor_b32_e32 v8, 1, v6 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[4:5] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v7, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v6, v0, 0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v8, v8, v9 +; GCN-IR-NEXT: v_and_b32_e32 v8, 1, v8 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB1_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 1, v4 @@ -875,16 +891,22 @@ define amdgpu_kernel void @s_test_urem_k_num_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 ; GCN-IR-NEXT: s_add_u32 s8, s12, 0xffffffc5 ; GCN-IR-NEXT: s_addc_u32 s9, 0, -1 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[6:7], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[10:11], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[14:15], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[6:7], s[10:11] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[10:11], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[6:7] +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 63 +; GCN-IR-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7] +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-IR-NEXT: v_readfirstlane_b32 s7, v1 +; GCN-IR-NEXT: s_or_b32 s7, s6, s7 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[10:11] +; GCN-IR-NEXT: v_or_b32_e32 v0, s7, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: s_bitcmp1_b32 s7, 0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-IR-NEXT: s_cselect_b32 s6, 0, 24 -; GCN-IR-NEXT: s_or_b64 s[10:11], s[10:11], s[14:15] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[10:11] ; GCN-IR-NEXT: s_mov_b32 s7, 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 @@ -991,17 +1013,23 @@ define amdgpu_kernel void @s_test_urem_k_den_i64(ptr addrspace(1) %out, i64 %x) ; GCN-IR-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 ; GCN-IR-NEXT: s_waitcnt lgkmcnt(0) ; GCN-IR-NEXT: s_flbit_i32_b64 s12, s[2:3] +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 ; GCN-IR-NEXT: s_sub_u32 s8, 59, s12 ; GCN-IR-NEXT: s_subb_u32 s9, 0, 0 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[2:3], 0 -; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[6:7], s[8:9], 63 -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[10:11], s[8:9], 63 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_b64 s[6:7], s[4:5], exec +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-IR-NEXT: v_cmp_gt_u64_e64 s[4:5], s[8:9], 63 +; GCN-IR-NEXT: v_readfirstlane_b32 s6, v0 +; GCN-IR-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-IR-NEXT: v_readfirstlane_b32 s4, v0 +; GCN-IR-NEXT: s_or_b32 s10, s6, s4 +; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], s[8:9], 63 +; GCN-IR-NEXT: s_bitcmp1_b32 s10, 0 +; GCN-IR-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] +; GCN-IR-NEXT: v_or_b32_e32 v0, s10, v0 +; GCN-IR-NEXT: v_and_b32_e32 v0, 1, v0 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GCN-IR-NEXT: s_cselect_b32 s7, 0, s3 ; GCN-IR-NEXT: s_cselect_b32 s6, 0, s2 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], s[10:11] -; GCN-IR-NEXT: s_andn2_b64 vcc, exec, s[4:5] ; GCN-IR-NEXT: s_mov_b64 s[4:5], 0 ; GCN-IR-NEXT: s_cbranch_vccz .LBB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 @@ -1169,17 +1197,24 @@ define i64 @v_test_urem_pow2_k_num_i64(i64 %x) { ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 ; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 0xffffffd0, v10 -; GCN-IR-NEXT: v_addc_u32_e64 v3, s[6:7], 0, -1, vcc -; GCN-IR-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1] +; GCN-IR-NEXT: v_addc_u32_e64 v3, s[4:5], 0, -1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: v_cmp_ne_u64_e64 s[6:7], 63, v[2:3] -; GCN-IR-NEXT: v_mov_b32_e32 v4, 0x8000 -; GCN-IR-NEXT: s_or_b64 s[4:5], s[4:5], vcc -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v4, 0, s[4:5] -; GCN-IR-NEXT: s_xor_b64 s[4:5], s[4:5], -1 +; GCN-IR-NEXT: v_mov_b32_e32 v7, 0x8000 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc +; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 -; GCN-IR-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v6, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v7, 0, vcc +; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB8_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 @@ -1261,20 +1296,27 @@ define i64 @v_test_urem_pow2_k_den_i64(i64 %x) { ; GCN-IR: ; %bb.0: ; %_udiv-special-cases ; GCN-IR-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-IR-NEXT: v_ffbh_u32_e32 v2, v0 -; GCN-IR-NEXT: v_add_i32_e64 v2, s[4:5], 32, v2 +; GCN-IR-NEXT: v_add_i32_e32 v2, vcc, 32, v2 ; GCN-IR-NEXT: v_ffbh_u32_e32 v3, v1 ; GCN-IR-NEXT: v_min_u32_e32 v10, v2, v3 -; GCN-IR-NEXT: v_sub_i32_e64 v2, s[4:5], 48, v10 -; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5] +; GCN-IR-NEXT: v_sub_i32_e32 v2, vcc, 48, v10 +; GCN-IR-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, vcc +; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, 63, v[2:3] +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; GCN-IR-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1] -; GCN-IR-NEXT: v_cmp_lt_u64_e64 s[4:5], 63, v[2:3] -; GCN-IR-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; GCN-IR-NEXT: v_or_b32_e32 v4, v5, v4 +; GCN-IR-NEXT: v_xor_b32_e32 v6, 1, v4 +; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v4 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, vcc +; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 0, vcc ; GCN-IR-NEXT: v_cmp_ne_u64_e32 vcc, 63, v[2:3] -; GCN-IR-NEXT: s_xor_b64 s[6:7], s[4:5], -1 -; GCN-IR-NEXT: v_cndmask_b32_e64 v5, v1, 0, s[4:5] -; GCN-IR-NEXT: v_cndmask_b32_e64 v4, v0, 
0, s[4:5] -; GCN-IR-NEXT: s_and_b64 s[4:5], s[6:7], vcc -; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], s[4:5] +; GCN-IR-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc +; GCN-IR-NEXT: v_and_b32_e32 v6, v6, v7 +; GCN-IR-NEXT: v_and_b32_e32 v6, 1, v6 +; GCN-IR-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 +; GCN-IR-NEXT: s_and_saveexec_b64 s[6:7], vcc ; GCN-IR-NEXT: s_cbranch_execz .LBB9_6 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 1, v2 diff --git a/llvm/test/CodeGen/AMDGPU/usubo.ll b/llvm/test/CodeGen/AMDGPU/usubo.ll index 2f4f08175be0e..3c4c0239c02cf 100644 --- a/llvm/test/CodeGen/AMDGPU/usubo.ll +++ b/llvm/test/CodeGen/AMDGPU/usubo.ll @@ -628,7 +628,10 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_mov_b64 s[0:1], 0 ; SI-NEXT: s_cbranch_scc1 .LBB8_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; SI-NEXT: v_not_b32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; SI-NEXT: .LBB8_2: ; %exit ; SI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x9 ; SI-NEXT: s_mov_b32 s3, 0xf000 @@ -655,7 +658,10 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: s_mov_b64 s[0:1], 0 ; VI-NEXT: s_cbranch_scc1 .LBB8_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[0:1], vcc, -1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; VI-NEXT: v_not_b32_e32 v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; VI-NEXT: .LBB8_2: ; %exit ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x24 ; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[0:1] @@ -678,7 +684,10 @@ define amdgpu_kernel void @s_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: s_mov_b64 s[0:1], 0 ; GFX9-NEXT: s_cbranch_scc1 .LBB8_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[0:1], vcc, -1 +; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX9-NEXT: v_not_b32_e32 v1, v1 +; GFX9-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v1 ; GFX9-NEXT: .LBB8_2: ; %exit ; GFX9-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 @@ -727,7 +736,10 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; SI-NEXT: s_mov_b64 s[8:9], 0 ; SI-NEXT: s_cbranch_vccnz .LBB9_2 ; SI-NEXT: ; %bb.1: ; %if -; SI-NEXT: s_xor_b64 s[8:9], s[0:1], -1 +; SI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; SI-NEXT: v_not_b32_e32 v1, v1 +; SI-NEXT: v_and_b32_e32 v1, 1, v1 +; SI-NEXT: v_cmp_eq_u32_e64 s[8:9], 1, v1 ; SI-NEXT: .LBB9_2: ; %exit ; SI-NEXT: s_mov_b32 s0, s4 ; SI-NEXT: s_mov_b32 s1, s5 @@ -757,7 +769,10 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; VI-NEXT: v_sub_u32_e64 v0, s[0:1], v1, v2 ; VI-NEXT: s_cbranch_vccnz .LBB9_2 ; VI-NEXT: ; %bb.1: ; %if -; VI-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; VI-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; VI-NEXT: v_not_b32_e32 v1, v1 +; VI-NEXT: v_and_b32_e32 v1, 1, v1 +; VI-NEXT: v_cmp_eq_u32_e64 s[2:3], 1, v1 ; VI-NEXT: .LBB9_2: ; %exit ; VI-NEXT: v_mov_b32_e32 v1, s4 ; VI-NEXT: v_mov_b32_e32 v2, s5 @@ -781,7 +796,10 @@ define amdgpu_kernel void @v_usubo_clamp_bit(ptr addrspace(1) %out, ptr addrspac ; GFX9-NEXT: v_sub_co_u32_e64 v1, s[0:1], v2, v3 ; GFX9-NEXT: s_cbranch_vccnz .LBB9_2 ; GFX9-NEXT: ; %bb.1: ; %if -; GFX9-NEXT: s_xor_b64 s[2:3], s[0:1], -1 +; GFX9-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; GFX9-NEXT: v_not_b32_e32 v2, v2 +; GFX9-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[2:3], 1, v2 ; GFX9-NEXT: .LBB9_2: ; %exit ; GFX9-NEXT: global_store_dword v0, v1, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll index a41063f467d01..758450e5acdc4 100644 --- a/llvm/test/CodeGen/AMDGPU/v_cndmask.ll +++ b/llvm/test/CodeGen/AMDGPU/v_cndmask.ll @@ -1543,27 +1543,25 @@ define amdgpu_kernel void @fcmp_k0_vgprX_select_k1_vgprZ_v4f32(ptr addrspace(1) define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %out, ptr addrspace(1) %x.ptr, ptr addrspace(1) %z.ptr) #0 { ; SI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; SI: ; %bb.0: -; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 -; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd -; SI-NEXT: s_mov_b32 s6, 0 +; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0xd +; SI-NEXT: s_mov_b32 s10, 0 ; SI-NEXT: v_mov_b32_e32 v1, 0 -; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s11, 0xf000 ; SI-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; SI-NEXT: v_mov_b32_e32 v3, v1 -; SI-NEXT: s_mov_b64 s[2:3], s[6:7] +; SI-NEXT: s_mov_b64 s[6:7], s[10:11] ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: s_mov_b64 s[4:5], s[10:11] -; SI-NEXT: buffer_load_dword v2, v[2:3], s[4:7], 0 addr64 glc +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dword v2, v[2:3], s[8:11], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[0:3], 0 addr64 glc +; SI-NEXT: buffer_load_ubyte v3, v[0:1], s[4:7], 0 addr64 glc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: s_mov_b64 s[10:11], s[6:7] -; SI-NEXT: v_and_b32_e32 v3, 1, v3 -; SI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; SI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 -; SI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] -; SI-NEXT: buffer_store_byte v2, v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b64 s[2:3], s[10:11] +; SI-NEXT: v_lshrrev_b32_e32 v2, 31, v2 +; SI-NEXT: v_or_b32_e32 v2, v2, v3 +; SI-NEXT: v_and_b32_e32 v2, 1, v2 +; SI-NEXT: buffer_store_byte v2, v[0:1], s[0:3], 0 addr64 ; SI-NEXT: s_endpgm ; ; VI-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: @@ -1585,75 +1583,67 @@ define amdgpu_kernel void @icmp_vgprX_k0_select_k1_vgprZ_i1(ptr addrspace(1) %ou ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; VI-NEXT: v_and_b32_e32 v3, 1, v3 -; VI-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v3 -; VI-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; VI-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[0:1] +; VI-NEXT: v_lshrrev_b32_e32 v2, 31, v2 +; VI-NEXT: v_or_b32_e32 v2, v2, v3 +; VI-NEXT: v_and_b32_e32 v2, 1, v2 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; ; GFX10-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x24 -; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x34 +; GFX10-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x34 ; GFX10-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: global_load_dword v2, v1, s[10:11] glc dlc +; GFX10-NEXT: global_load_dword v2, v1, s[2:3] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: global_load_ubyte v3, v0, s[0:1] glc dlc +; GFX10-NEXT: global_load_ubyte v3, v0, s[6:7] glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_cmp_gt_i32_e32 vcc, 0, v2 -; GFX10-NEXT: v_and_b32_e32 v1, 1, v3 -; GFX10-NEXT: 
v_cmp_eq_u32_e64 s[0:1], 1, v1 -; GFX10-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX10-NEXT: global_store_byte v0, v1, s[8:9] +; GFX10-NEXT: v_lshrrev_b32_e32 v1, 31, v2 +; GFX10-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: global_store_byte v0, v1, s[0:1] ; GFX10-NEXT: s_endpgm ; ; GFX11-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: global_load_b32 v1, v1, s[10:11] glc dlc +; GFX11-NEXT: global_load_b32 v1, v1, s[2:3] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: global_load_u8 v2, v0, s[0:1] glc dlc +; GFX11-NEXT: global_load_u8 v2, v0, s[4:5] glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 -; GFX11-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX11-NEXT: global_store_b8 v0, v1, s[8:9] +; GFX11-NEXT: v_lshrrev_b32_e32 v1, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX11-NEXT: s_endpgm ; ; GFX12-LABEL: icmp_vgprX_k0_select_k1_vgprZ_i1: ; GFX12: ; %bb.0: ; GFX12-NEXT: s_clause 0x1 -; GFX12-NEXT: s_load_b128 s[8:11], s[4:5], 0x24 -; GFX12-NEXT: s_load_b64 s[0:1], s[4:5], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX12-NEXT: s_load_b64 s[4:5], s[4:5], 0x34 ; GFX12-NEXT: v_and_b32_e32 v0, 0x3ff, v0 ; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-NEXT: v_lshlrev_b32_e32 v1, 2, v0 ; GFX12-NEXT: s_wait_kmcnt 0x0 -; GFX12-NEXT: global_load_b32 v1, v1, s[10:11] scope:SCOPE_SYS +; GFX12-NEXT: global_load_b32 v1, v1, s[2:3] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: global_load_u8 v2, v0, s[0:1] scope:SCOPE_SYS +; GFX12-NEXT: global_load_u8 v2, v0, s[4:5] scope:SCOPE_SYS ; GFX12-NEXT: s_wait_loadcnt 0x0 -; GFX12-NEXT: v_cmp_gt_i32_e32 vcc, 0, v1 -; GFX12-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) -; GFX12-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v2 -; GFX12-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX12-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] -; GFX12-NEXT: global_store_b8 v0, v1, s[8:9] +; GFX12-NEXT: v_lshrrev_b32_e32 v1, 31, v1 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX12-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX12-NEXT: global_store_b8 v0, v1, s[0:1] ; GFX12-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() #1 %tid.ext = sext i32 %tid to i64 diff --git a/llvm/test/CodeGen/AMDGPU/valu-i1.ll b/llvm/test/CodeGen/AMDGPU/valu-i1.ll index 35cd2663f523f..8e9146a7c9d75 100644 --- a/llvm/test/CodeGen/AMDGPU/valu-i1.ll +++ b/llvm/test/CodeGen/AMDGPU/valu-i1.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc 
-mtriple=amdgcn -verify-machineinstrs -enable-misched -asm-verbose -disable-block-placement -simplifycfg-require-and-preserve-domtree=1 < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -25,6 +26,96 @@ declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone ; SI-NEXT: s_andn2_saveexec_b64 [[SAVE2]], [[SAVE2]] ; define amdgpu_kernel void @test_if(i32 %b, ptr addrspace(1) %src, ptr addrspace(1) %dst) #1 { +; SI-LABEL: test_if: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dword s8, s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xd +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 1, v0 +; SI-NEXT: s_mov_b64 s[10:11], 0 +; SI-NEXT: s_mov_b64 s[2:3], 0 +; SI-NEXT: s_and_saveexec_b64 s[4:5], vcc +; SI-NEXT: s_xor_b64 s[4:5], exec, s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_3 +; SI-NEXT: ; %bb.1: ; %LeafBlock3 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; SI-NEXT: s_mov_b64 s[2:3], -1 +; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SI-NEXT: s_cbranch_execnz .LBB0_9 +; SI-NEXT: .LBB0_2: ; %Flow7 +; SI-NEXT: s_or_b64 exec, exec, s[6:7] +; SI-NEXT: s_and_b64 s[2:3], s[2:3], exec +; SI-NEXT: .LBB0_3: ; %Flow6 +; SI-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5] +; SI-NEXT: s_cbranch_execz .LBB0_5 +; SI-NEXT: ; %bb.4: ; %LeafBlock +; SI-NEXT: s_mov_b64 s[10:11], exec +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 1, v0 +; SI-NEXT: s_andn2_b64 s[2:3], s[2:3], exec +; SI-NEXT: s_and_b64 s[6:7], vcc, exec +; SI-NEXT: s_or_b64 s[2:3], s[2:3], s[6:7] +; SI-NEXT: .LBB0_5: ; %Flow8 +; SI-NEXT: s_or_b64 exec, exec, s[4:5] +; SI-NEXT: s_and_saveexec_b64 s[4:5], s[2:3] +; SI-NEXT: s_xor_b64 s[2:3], exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB0_10 +; SI-NEXT: .LBB0_6: ; %Flow9 +; SI-NEXT: s_or_b64 exec, exec, s[2:3] +; SI-NEXT: s_and_saveexec_b64 s[2:3], s[10:11] +; SI-NEXT: s_cbranch_execz .LBB0_8 +; SI-NEXT: ; %bb.7: ; %case1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_ashr_i32 s9, s8, 31 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_lshl_b64 s[4:5], s[8:9], 2 +; SI-NEXT: v_mov_b32_e32 v2, 13 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: .LBB0_8: ; %end +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB0_9: ; %case2 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_ashr_i32 s9, s8, 31 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: s_lshl_b64 s[12:13], s[8:9], 2 +; SI-NEXT: v_mov_b32_e32 v3, 17 +; SI-NEXT: v_mov_b32_e32 v1, s12 +; SI-NEXT: v_mov_b32_e32 v2, s13 +; SI-NEXT: buffer_store_dword v3, v[1:2], s[0:3], 0 addr64 +; SI-NEXT: s_xor_b64 s[2:3], exec, -1 +; SI-NEXT: s_branch .LBB0_2 +; SI-NEXT: .LBB0_10: ; %default +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 2, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_ashr_i32 s9, s8, 31 +; SI-NEXT: s_lshl_b64 s[4:5], s[8:9], 2 +; SI-NEXT: s_add_u32 s4, s0, s4 +; SI-NEXT: s_addc_u32 s5, s1, s5 +; SI-NEXT: s_and_saveexec_b64 s[6:7], vcc +; SI-NEXT: s_xor_b64 s[12:13], exec, s[6:7] +; SI-NEXT: s_cbranch_execnz .LBB0_14 +; SI-NEXT: .LBB0_11: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13] +; SI-NEXT: s_cbranch_execz .LBB0_13 +; SI-NEXT: ; %bb.12: ; %if +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 19 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: .LBB0_13: ; %Flow5 +; SI-NEXT: s_or_b64 exec, exec, s[12:13] +; SI-NEXT: s_andn2_b64 s[10:11], s[10:11], exec +; SI-NEXT: s_branch .LBB0_6 +; 
SI-NEXT: .LBB0_14: ; %else +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: v_mov_b32_e32 v0, 21 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_branch .LBB0_11 entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone switch i32 %tid, label %default [ @@ -70,6 +161,22 @@ end: ; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_endpgm define amdgpu_kernel void @simple_test_v_if(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 { +; SI-LABEL: simple_test_v_if: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_cbranch_execz .LBB1_2 +; SI-NEXT: ; %bb.1: ; %then +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v2, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: .LBB1_2: ; %exit +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 br i1 %is.0, label %then, label %exit @@ -96,6 +203,22 @@ exit: ; SI-NEXT: {{^}}[[EXIT]]: ; SI: s_endpgm define amdgpu_kernel void @simple_test_v_if_ret_else_ret(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 { +; SI-LABEL: simple_test_v_if_ret_else_ret: +; SI: ; %bb.0: +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_cbranch_execz .LBB2_2 +; SI-NEXT: ; %bb.1: ; %then +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v2, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: .LBB2_2: ; %UnifiedReturnBlock +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() %is.0 = icmp ne i32 %tid, 0 br i1 %is.0, label %then, label %exit @@ -133,6 +256,32 @@ exit: ; SI-NEXT: {{^}}[[EXIT]]: ; SI: ds_write_b32 define amdgpu_kernel void @simple_test_v_if_ret_else_code_ret(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 { +; SI-LABEL: simple_test_v_if_ret_else_code_ret: +; SI: ; %bb.0: +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_xor_b64 s[0:1], exec, s[0:1] +; SI-NEXT: s_cbranch_execnz .LBB3_4 +; SI-NEXT: .LBB3_1: ; %Flow +; SI-NEXT: s_andn2_saveexec_b64 s[0:1], s[0:1] +; SI-NEXT: s_cbranch_execz .LBB3_3 +; SI-NEXT: ; %bb.2: ; %then +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: v_mov_b32_e32 v2, 0x3e7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: .LBB3_3: ; %UnifiedReturnBlock +; SI-NEXT: s_endpgm +; SI-NEXT: .LBB3_4: ; %exit +; SI-NEXT: v_mov_b32_e32 v0, 7 +; SI-NEXT: s_mov_b32 m0, -1 +; SI-NEXT: ds_write_b32 v0, v0 +; SI-NEXT: ; implicit-def: $vgpr0 +; SI-NEXT: s_branch .LBB3_1 %tid = call i32 @llvm.amdgcn.workitem.id.x() %is.0 = icmp ne i32 %tid, 0 br i1 %is.0, label %then, label %exit @@ -162,6 +311,37 @@ exit: ; SI: [[LABEL_EXIT]]: ; SI: s_endpgm define amdgpu_kernel void @simple_test_v_loop(ptr addrspace(1) %dst, ptr addrspace(1) %src) #1 { +; SI-LABEL: simple_test_v_loop: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; SI-NEXT: 
s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_cbranch_execz .LBB4_3 +; SI-NEXT: ; %bb.1: ; %loop.preheader +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; SI-NEXT: s_mov_b64 s[0:1], 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_add_i32_e32 v0, vcc, s8, v0 +; SI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s10 +; SI-NEXT: s_mov_b32 s5, s11 +; SI-NEXT: s_mov_b32 s7, s3 +; SI-NEXT: .LBB4_2: ; %loop +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v2, off, s[4:7], 0 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 +; SI-NEXT: s_add_u32 s0, s0, 4 +; SI-NEXT: s_addc_u32 s1, s1, 0 +; SI-NEXT: s_cmpk_lg_i32 s0, 0x100 +; SI-NEXT: s_cbranch_scc1 .LBB4_2 +; SI-NEXT: .LBB4_3: ; %exit +; SI-NEXT: s_endpgm entry: %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %is.0 = icmp ne i32 %tid, 0 @@ -221,6 +401,80 @@ exit: ; SI-NOT: [[COND_STATE]] ; SI: s_endpgm define amdgpu_kernel void @multi_vcond_loop(ptr addrspace(1) noalias nocapture %arg, ptr addrspace(1) noalias nocapture readonly %arg1, ptr addrspace(1) noalias nocapture readonly %arg2, ptr addrspace(1) noalias nocapture readonly %arg3) #1 { +; SI-LABEL: multi_vcond_loop: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0xf +; SI-NEXT: s_mov_b32 s2, 0 +; SI-NEXT: v_mov_b32_e32 v7, 0 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: v_lshlrev_b32_e32 v6, 2, v0 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: buffer_load_dword v0, v[6:7], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_lt_i32_e32 vcc, 0, v0 +; SI-NEXT: s_and_saveexec_b64 s[0:1], vcc +; SI-NEXT: s_cbranch_execz .LBB5_5 +; SI-NEXT: ; %bb.1: ; %bb10.preheader +; SI-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x9 +; SI-NEXT: s_load_dwordx2 s[12:13], s[4:5], 0xd +; SI-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; SI-NEXT: s_mov_b64 s[4:5], 0 +; SI-NEXT: s_mov_b32 s0, s2 +; SI-NEXT: s_mov_b32 s1, s2 +; SI-NEXT: ; implicit-def: $sgpr6_sgpr7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v3, s9 +; SI-NEXT: v_add_i32_e32 v2, vcc, s8, v6 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_mov_b32_e32 v5, s13 +; SI-NEXT: v_add_i32_e32 v4, vcc, s12, v6 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: v_mov_b32_e32 v7, s11 +; SI-NEXT: v_add_i32_e32 v6, vcc, s10, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: s_mov_b64 s[8:9], 0 +; SI-NEXT: .LBB5_2: ; %bb10 +; SI-NEXT: ; =>This Inner Loop Header: Depth=1 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: buffer_load_dword v8, v[4:5], s[0:3], 0 addr64 +; SI-NEXT: buffer_load_dword v9, v[6:7], s[0:3], 0 addr64 +; SI-NEXT: s_waitcnt vmcnt(1) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v8 +; SI-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_cmp_ne_u32_e32 vcc, -1, v9 +; SI-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc +; SI-NEXT: v_and_b32_e32 v10, v11, v10 +; SI-NEXT: v_and_b32_e32 v10, 1, v10 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v10 +; SI-NEXT: s_or_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_and_saveexec_b64 s[10:11], vcc +; SI-NEXT: s_cbranch_execz .LBB5_4 +; SI-NEXT: ; %bb.3: ; %bb20 +; SI-NEXT: ; in Loop: Header=BB5_2 Depth=1 +; SI-NEXT: v_add_i32_e32 v8, vcc, v8, v9 +; SI-NEXT: s_add_u32 s8, s8, 1 +; SI-NEXT: v_add_i32_e32 v4, vcc, 4, v4 +; SI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc +; SI-NEXT: 
v_add_i32_e32 v6, vcc, 4, v6 +; SI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc +; SI-NEXT: buffer_store_dword v8, v[2:3], s[0:3], 0 addr64 +; SI-NEXT: s_addc_u32 s9, s9, 0 +; SI-NEXT: v_add_i32_e32 v2, vcc, 4, v2 +; SI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc +; SI-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[0:1] +; SI-NEXT: s_andn2_b64 s[6:7], s[6:7], exec +; SI-NEXT: s_and_b64 s[12:13], vcc, exec +; SI-NEXT: s_or_b64 s[6:7], s[6:7], s[12:13] +; SI-NEXT: .LBB5_4: ; %Flow +; SI-NEXT: ; in Loop: Header=BB5_2 Depth=1 +; SI-NEXT: s_or_b64 exec, exec, s[10:11] +; SI-NEXT: s_and_b64 s[10:11], exec, s[6:7] +; SI-NEXT: s_or_b64 s[4:5], s[10:11], s[4:5] +; SI-NEXT: s_andn2_b64 exec, exec, s[4:5] +; SI-NEXT: s_cbranch_execnz .LBB5_2 +; SI-NEXT: .LBB5_5: ; %bb26 +; SI-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() #0 %tmp4 = sext i32 %tmp to i64 diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll index 801324eec454e..41973bf822d0c 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-and.ll @@ -994,45 +994,28 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) { ; GFX8-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: test_vector_reduce_and_v2i16: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_alignbit_b32 v1, s0, v0, 16 -; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: test_vector_reduce_and_v2i16: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: test_vector_reduce_and_v2i16: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_alignbit_b32 v1, s4, v0, 16 -; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_vector_reduce_and_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: test_vector_reduce_and_v2i16: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: test_vector_reduce_and_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v2i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: 
s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -1052,10 +1035,7 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v2i16: @@ -1065,7 +1045,7 @@ define i16 @test_vector_reduce_and_v2i16(<2 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -1126,11 +1106,9 @@ define i16 @test_vector_reduce_and_v3i16(<3 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v3i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v3i16: @@ -1158,11 +1136,9 @@ define i16 @test_vector_reduce_and_v3i16(<3 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v3i16: @@ -1223,8 +1199,9 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_and_v4i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_and_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 
-; GFX8-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_and_v4i16: @@ -1237,37 +1214,54 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) { ; GFX8-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_vector_reduce_and_v4i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: test_vector_reduce_and_v4i16: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_and_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_vector_reduce_and_v4i16: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: test_vector_reduce_and_v4i16: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: test_vector_reduce_and_v4i16: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_and_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_vector_reduce_and_v4i16: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v4i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v4i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: 
v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, v3, v2 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_and_v4i16: @@ -1286,11 +1280,10 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v4i16: @@ -1300,10 +1293,12 @@ define i16 @test_vector_reduce_and_v4i16(<4 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, v3, v2 +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_and_v4i16: @@ -1362,10 +1357,13 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_and_v8i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_and_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_and_b32_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_and_b32_e32 v2, v5, v4 ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_and_v8i16: @@ -1380,10 +1378,13 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_and_v8i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_and_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_and_b32_sdwa v5, 
v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_and_b32_e32 v2, v5, v4 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_and_v8i16: @@ -1398,10 +1399,13 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_and_v8i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_and_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_and_b32_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_and_b32_e32 v2, v5, v4 ; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_and_v8i16: @@ -1416,26 +1420,33 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v8i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, v1.h, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v2.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v8i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, v5, v4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 
v4, v7, v6 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, v4, v3 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_and_v8i16: @@ -1457,14 +1468,15 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, v1.h, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v2.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v8i16: @@ -1474,13 +1486,19 @@ define i16 @test_vector_reduce_and_v8i16(<8 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, v5, v4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, v7, v6 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, v4, v3 +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_and_v8i16: @@ -1566,14 +1584,21 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_and_v16i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_and_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_and_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_and_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; 
GFX8-SDAG-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_and_b32_e32 v2, v2, v6 ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v4 ; GFX8-SDAG-NEXT: v_and_b32_e32 v3, v3, v7 ; GFX8-SDAG-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX8-SDAG-NEXT: v_and_b32_e32 v4, v11, v10 +; GFX8-SDAG-NEXT: v_and_b32_e32 v5, v9, v8 ; GFX8-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_and_b32_e32 v2, v5, v4 ; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_and_v16i16: @@ -1592,14 +1617,21 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_and_v16i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_and_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_and_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_and_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_and_b32_e32 v2, v2, v6 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v4 ; GFX9-SDAG-NEXT: v_and_b32_e32 v3, v3, v7 ; GFX9-SDAG-NEXT: v_and_b32_e32 v1, v1, v5 +; GFX9-SDAG-NEXT: v_and_b32_e32 v4, v11, v10 +; GFX9-SDAG-NEXT: v_and_b32_e32 v5, v9, v8 ; GFX9-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_and_b32_e32 v2, v5, v4 ; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_and_v16i16: @@ -1618,14 +1650,21 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_and_v16i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_and_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_and_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_and_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_and_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_and_b32_e32 v2, v2, v6 ; GFX10-SDAG-NEXT: v_and_b32_e32 v3, v3, v7 ; GFX10-SDAG-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v4 +; GFX10-SDAG-NEXT: v_and_b32_e32 v4, v11, v10 +; GFX10-SDAG-NEXT: v_and_b32_e32 v5, v9, v8 ; GFX10-SDAG-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_and_b32_e32 v2, v5, v4 ; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: v_and_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_and_v16i16: @@ -1644,36 +1683,55 
@@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_and_v16i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, v2, v6 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, v3, v7 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, v1, v5 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.h, v2.h, v6.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v4.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.h, v3.h, v7.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, v1.h, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v2.l, v2.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v3.l, v3.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, v1.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.h, v1.h, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v2.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_and_v16i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v8, v9, v8 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v9, v11, v10 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, v13, v12 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, v15, v14 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, v3, v7 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, v9, v8 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, v11, v10 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, v5, v4 ; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_and_v16i16: @@ -1700,19 +1758,24 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v2, v2, v6 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v3, v3, v7 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, v1, v5 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v1, v1, v3 -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.h, v2.h, v6.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v4.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.h, v3.h, v7.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, v1.h, v5.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v2.l, v2.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v3.l, v3.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, v1.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v4.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.h, v1.h, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v2.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_and_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_and_v16i16: @@ -1722,18 +1785,32 @@ define i16 @test_vector_reduce_and_v16i16(<16 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v8, v9, v8 +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v9, v11, v10 +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v10, v13, v12 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, v2, v6 +; 
GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v11, v15, v14 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v3, v3, v7 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, v1, v5 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v4 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v5, v9, v8 +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v4, v11, v10 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v1, v1, v3 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v2, v5, v4 ; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_and_v16i16: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll index bdb1c22ce7267..1be49e24e4d71 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-or.ll @@ -1017,45 +1017,28 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) { ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: test_vector_reduce_or_v2i16: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_alignbit_b32 v1, s0, v0, 16 -; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: test_vector_reduce_or_v2i16: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: test_vector_reduce_or_v2i16: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_alignbit_b32 v1, s4, v0, 16 -; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_vector_reduce_or_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: test_vector_reduce_or_v2i16: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: test_vector_reduce_or_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v2i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; 
GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -1075,10 +1058,7 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v2i16: @@ -1088,7 +1068,7 @@ define i16 @test_vector_reduce_or_v2i16(<2 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -1136,35 +1116,49 @@ define i16 @test_vector_reduce_or_v3i16(<3 x i16> %v) { ; GFX8-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_vector_reduce_or_v3i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX9-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: test_vector_reduce_or_v3i16: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_vector_reduce_or_v3i16: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: test_vector_reduce_or_v3i16: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX9-GISEL-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: test_vector_reduce_or_v3i16: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_vector_reduce_or_v3i16: 
+; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-GISEL-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v3i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v3i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_or_v3i16: @@ -1182,10 +1176,9 @@ define i16 @test_vector_reduce_or_v3i16(<3 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v3i16: @@ -1196,8 +1189,9 @@ define i16 @test_vector_reduce_or_v3i16(<3 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_or_v3i16: @@ -1244,8 +1238,9 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_or_v4i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_or_v4i16: @@ -1258,37 +1253,54 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) { ; GFX8-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_vector_reduce_or_v4i16: -; GFX9: ; 
%bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: test_vector_reduce_or_v4i16: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_vector_reduce_or_v4i16: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: test_vector_reduce_or_v4i16: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: test_vector_reduce_or_v4i16: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_or_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_vector_reduce_or_v4i16: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v4i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v4i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: 
test_vector_reduce_or_v4i16: @@ -1307,11 +1319,10 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v4i16: @@ -1321,10 +1332,12 @@ define i16 @test_vector_reduce_or_v4i16(<4 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, v3, v2 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_or_v4i16: @@ -1383,10 +1396,13 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_or_v8i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_or_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v5, v4 ; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_or_v8i16: @@ -1401,9 +1417,13 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_or_v8i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_or_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_or_b32_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX9-SDAG-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX9-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_or_v8i16: @@ -1418,9 +1438,13 @@ define i16 
@test_vector_reduce_or_v8i16(<8 x i16> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_or_v8i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_or_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_or_b32_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX10-SDAG-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_or_v8i16: @@ -1435,24 +1459,33 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v8i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v8i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_or_v8i16: @@ -1474,13 +1507,15 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: 
s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v8i16: @@ -1490,12 +1525,19 @@ define i16 @test_vector_reduce_or_v8i16(<8 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v1, v3 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v3, v5, v4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, v7, v6 ; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, v4, v3 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_or_v8i16: @@ -1581,14 +1623,21 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_or_v16i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_or_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-SDAG-NEXT: v_or_b32_e32 v3, v3, v7 ; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX8-SDAG-NEXT: v_or_b32_e32 v4, v11, v10 +; GFX8-SDAG-NEXT: v_or_b32_e32 v5, v9, v8 ; GFX8-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_or_b32_e32 v2, v5, v4 ; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_or_v16i16: @@ -1607,12 +1656,21 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_or_v16i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX9-SDAG-NEXT: v_or_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_or_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_or_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_or_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_or_b32_e32 v2, v2, v6 ; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX9-SDAG-NEXT: v_or3_b32 v1, v1, v5, v3 -; GFX9-SDAG-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX9-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX9-SDAG-NEXT: v_or_b32_e32 v1, v1, v5 +; GFX9-SDAG-NEXT: v_or_b32_e32 v4, v11, v10 +; GFX9-SDAG-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX9-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_or_v16i16: @@ -1631,12 +1689,21 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_or_v16i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX10-SDAG-NEXT: v_or_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_or_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_or_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_or_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX10-SDAG-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX10-SDAG-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX10-SDAG-NEXT: v_or3_b32 v1, v1, v5, v3 -; GFX10-SDAG-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX10-SDAG-NEXT: v_or_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v4, v11, v10 +; GFX10-SDAG-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX10-SDAG-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_or_b32_e32 v2, v5, v4 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_or_v16i16: @@ -1655,30 +1722,55 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_or_v16i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-SDAG-TRUE16-NEXT: 
s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_or3_b32 v1, v1, v5, v3 -; GFX11-SDAG-TRUE16-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v6.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v7.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_or_v16i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v9, v11, v10 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v10, v13, v12 ; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v11, v15, v14 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_or3_b32 v1, v1, v5, v3 -; GFX11-SDAG-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, v11, v10 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, v5, v4 ; GFX11-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: 
v_or_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_or_v16i16: @@ -1705,16 +1797,24 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v3, v3, v7 -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v2, v2, v6 -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_or3_b32 v1, v1, v5, v3 -; GFX12-SDAG-TRUE16-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v2.h, v2.h, v6.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v4.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v3.h, v3.h, v7.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v5.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v2.l, v2.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v3.l, v3.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v4.l +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.h, v1.h, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v2.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_or_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_or_v16i16: @@ -1724,15 +1824,32 @@ define i16 @test_vector_reduce_or_v16i16(<16 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v8, v9, v8 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v9, v11, v10 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v10, v13, v12 ; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, v2, v6 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v11, v15, v14 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v3, v3, v7 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v1, v5 ; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v4 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_or3_b32 v1, v1, v5, v3 -; GFX12-SDAG-FAKE16-NEXT: v_or3_b32 v0, v0, v2, v1 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; 
GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v5, v9, v8 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v4, v11, v10 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v2, v5, v4 ; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_or_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_or_v16i16: diff --git a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll index cf344ea9b92d4..96d0f28d82bd0 100644 --- a/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll +++ b/llvm/test/CodeGen/AMDGPU/vector-reduce-xor.ll @@ -963,45 +963,28 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) { ; GFX8-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-SDAG-LABEL: test_vector_reduce_xor_v2i16: -; GFX9-SDAG: ; %bb.0: ; %entry -; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-SDAG-NEXT: v_alignbit_b32 v1, s0, v0, 16 -; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] -; -; GFX9-GISEL-LABEL: test_vector_reduce_xor_v2i16: -; GFX9-GISEL: ; %bb.0: ; %entry -; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] -; -; GFX10-SDAG-LABEL: test_vector_reduce_xor_v2i16: -; GFX10-SDAG: ; %bb.0: ; %entry -; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_alignbit_b32 v1, s4, v0, 16 -; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; GFX9-LABEL: test_vector_reduce_xor_v2i16: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-GISEL-LABEL: test_vector_reduce_xor_v2i16: -; GFX10-GISEL: ; %bb.0: ; %entry -; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] +; GFX10-LABEL: test_vector_reduce_xor_v2i16: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v2i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1 -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i16: ; GFX11-SDAG-FAKE16: ; 
%bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -1021,10 +1004,7 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_lshl_or_b32 v1, s0, 16, v1 -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v2i16: @@ -1034,7 +1014,7 @@ define i16 @test_vector_reduce_xor_v2i16(<2 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_alignbit_b32 v1, s0, v0, 16 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] @@ -1089,28 +1069,35 @@ define i16 @test_vector_reduce_xor_v3i16(<3 x i16> %v) { ; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_vector_reduce_xor_v3i16: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX10-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX10-SDAG-LABEL: test_vector_reduce_xor_v3i16: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_vector_reduce_xor_v3i16: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 +; GFX10-GISEL-NEXT: v_xor3_b32 v0, v0, v2, v1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v3i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_xor3_b32 v0, v0, v2, v1 +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v3i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_xor3_b32 v0, v0, v2, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: 
v_xor_b32_e32 v0, v0, v1 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v3i16: @@ -1128,10 +1115,9 @@ define i16 @test_vector_reduce_xor_v3i16(<3 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.h, 0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v2.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_xor3_b32 v0, v0, v2, v1 +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v3i16: @@ -1142,8 +1128,9 @@ define i16 @test_vector_reduce_xor_v3i16(<3 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_xor3_b32 v0, v0, v2, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v3i16: @@ -1190,8 +1177,9 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_xor_v4i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_xor_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_xor_v4i16: @@ -1204,37 +1192,54 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) { ; GFX8-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 ; GFX8-GISEL-NEXT: s_setpc_b64 s[30:31] ; -; GFX9-LABEL: test_vector_reduce_xor_v4i16: -; GFX9: ; %bb.0: ; %entry -; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX9-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: s_setpc_b64 s[30:31] +; GFX9-SDAG-LABEL: test_vector_reduce_xor_v4i16: +; GFX9-SDAG: ; %bb.0: ; %entry +; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_xor_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GFX10-LABEL: test_vector_reduce_xor_v4i16: -; GFX10: ; %bb.0: ; %entry -; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX10-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX10-NEXT: s_setpc_b64 s[30:31] +; GFX9-GISEL-LABEL: test_vector_reduce_xor_v4i16: +; GFX9-GISEL: ; %bb.0: ; %entry +; GFX9-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD 
src1_sel:WORD_1 +; GFX9-GISEL-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-SDAG-LABEL: test_vector_reduce_xor_v4i16: +; GFX10-SDAG: ; %bb.0: ; %entry +; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_xor_b32_sdwa v2, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] +; +; GFX10-GISEL-LABEL: test_vector_reduce_xor_v4i16: +; GFX10-GISEL: ; %bb.0: ; %entry +; GFX10-GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-GISEL-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-GISEL-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-GISEL-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v4i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v4i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v2, v3, v2 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v4i16: @@ -1253,11 +1258,10 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v4i16: @@ -1267,10 +1271,12 @@ define i16 @test_vector_reduce_xor_v4i16(<4 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v2, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v3, 16, v0 ; 
GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v2, v3, v2 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v4i16: @@ -1329,10 +1335,13 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_xor_v8i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_xor_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_xor_b32_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_xor_b32_e32 v2, v5, v4 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_xor_v8i16: @@ -1347,10 +1356,13 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_xor_v8i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_xor_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_xor_b32_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v5, v4 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_xor_v8i16: @@ -1365,9 +1377,13 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_xor_v8i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX10-SDAG-NEXT: v_xor_b32_sdwa v4, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_xor_b32_sdwa v5, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX10-SDAG-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX10-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v2, v5, v4 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_xor_v8i16: @@ -1382,24 +1398,33 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v8i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu 
instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX11-SDAG-TRUE16-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, v1.h, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v2.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v1.l ; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v8i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v3, v5, v4 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v4, v7, v6 ; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v2, v4, v3 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v8i16: @@ -1421,13 +1446,15 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GFX12-SDAG-TRUE16-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, v1.h, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v2.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v1.l ; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v8i16: @@ -1437,12 +1464,19 @@ define i16 @test_vector_reduce_xor_v8i16(<8 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; 
GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v4, 16, v3 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v5, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v6, 16, v2 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v7, 16, v0 ; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, v1, v3 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v3, v5, v4 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v4, v7, v6 ; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v2, v4, v3 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v8i16: @@ -1528,14 +1562,21 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) { ; GFX8-SDAG-LABEL: test_vector_reduce_xor_v16i16: ; GFX8-SDAG: ; %bb.0: ; %entry ; GFX8-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-SDAG-NEXT: v_xor_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_xor_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_xor_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_xor_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v2, v2, v6 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v3, v3, v7 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX8-SDAG-NEXT: v_xor_b32_e32 v4, v11, v10 +; GFX8-SDAG-NEXT: v_xor_b32_e32 v5, v9, v8 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX8-SDAG-NEXT: v_xor_b32_e32 v2, v5, v4 ; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX8-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX8-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX8-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-GISEL-LABEL: test_vector_reduce_xor_v16i16: @@ -1554,14 +1595,21 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) { ; GFX9-SDAG-LABEL: test_vector_reduce_xor_v16i16: ; GFX9-SDAG: ; %bb.0: ; %entry ; GFX9-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-SDAG-NEXT: v_xor_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_xor_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_xor_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_xor_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v2, v6 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v4 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v3, v3, v7 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v4, v11, v10 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v5, v9, v8 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 ; GFX9-SDAG-NEXT: 
v_xor_b32_e32 v0, v0, v2 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v2, v5, v4 ; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 -; GFX9-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX9-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-GISEL-LABEL: test_vector_reduce_xor_v16i16: @@ -1580,12 +1628,21 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) { ; GFX10-SDAG-LABEL: test_vector_reduce_xor_v16i16: ; GFX10-SDAG: ; %bb.0: ; %entry ; GFX10-SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SDAG-NEXT: v_xor_b32_e32 v3, v3, v7 +; GFX10-SDAG-NEXT: v_xor_b32_sdwa v8, v2, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_xor_b32_sdwa v9, v0, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_xor_b32_sdwa v10, v3, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_xor_b32_sdwa v11, v1, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1 ; GFX10-SDAG-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v3, v3, v7 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX10-SDAG-NEXT: v_xor3_b32 v1, v1, v5, v3 -; GFX10-SDAG-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX10-SDAG-NEXT: v_xor_b32_sdwa v0, v0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v4, v11, v10 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v5, v9, v8 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v2, v5, v4 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-SDAG-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX10-SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-GISEL-LABEL: test_vector_reduce_xor_v16i16: @@ -1604,30 +1661,55 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) { ; GFX11-SDAG-TRUE16-LABEL: test_vector_reduce_xor_v16i16: ; GFX11-SDAG-TRUE16: ; %bb.0: ; %entry ; GFX11-SDAG-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v3, v3, v7 -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_xor3_b32 v1, v1, v5, v3 -; GFX11-SDAG-TRUE16-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX11-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, v2.h, v6.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v4.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v3.h, v3.h, v7.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, v1.h, v5.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v2.l, v2.l, v6.l +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v3.l, v3.l, v7.l +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, v1.l, v5.l +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v4.l +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, v1.h, v3.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v2.h +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, v1.l, v3.l +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v2.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | 
instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v1.h +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v1.l +; GFX11-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX11-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v16i16: ; GFX11-SDAG-FAKE16: ; %bb.0: ; %entry ; GFX11-SDAG-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v3, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v8, v9, v8 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v9, v11, v10 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v10, v13, v12 ; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v11, v15, v14 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v3, v3, v7 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_xor3_b32 v1, v1, v5, v3 -; GFX11-SDAG-FAKE16-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v5, v9, v8 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v4, v11, v10 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v2, v5, v4 ; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX11-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX11-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-GISEL-LABEL: test_vector_reduce_xor_v16i16: @@ -1654,16 +1736,24 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) { ; GFX12-SDAG-TRUE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v3, v3, v7 -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v2, v2, v6 -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_xor3_b32 v1, v1, v5, v3 -; GFX12-SDAG-TRUE16-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.h, 0 -; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.h -; GFX12-SDAG-TRUE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v2.h, v2.h, v6.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v4.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v3.h, v3.h, v7.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, v1.h, v5.h +; GFX12-SDAG-TRUE16-NEXT: 
v_xor_b16 v2.l, v2.l, v6.l +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v3.l, v3.l, v7.l +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, v1.l, v5.l +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v4.l +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v1.h, v1.h, v3.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v2.h +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v1.l, v1.l, v3.l +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v2.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.h, v0.h, v1.h +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v1.l +; GFX12-SDAG-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-TRUE16-NEXT: v_xor_b16 v0.l, v0.l, v0.h ; GFX12-SDAG-TRUE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-SDAG-FAKE16-LABEL: test_vector_reduce_xor_v16i16: @@ -1673,15 +1763,32 @@ define i16 @test_vector_reduce_xor_v16i16(<16 x i16> %v) { ; GFX12-SDAG-FAKE16-NEXT: s_wait_samplecnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_bvhcnt 0x0 ; GFX12-SDAG-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v3, v3, v7 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v8, 16, v6 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v10, 16, v4 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v11, 16, v0 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v12, 16, v7 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v13, 16, v3 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v14, 16, v5 +; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v15, 16, v1 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v8, v9, v8 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v9, v11, v10 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v10, v13, v12 ; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v2, v2, v6 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v11, v15, v14 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v3, v3, v7 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, v1, v5 ; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v4 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_xor3_b32 v1, v1, v5, v3 -; GFX12-SDAG-FAKE16-NEXT: v_xor3_b32 v0, v0, v2, v1 -; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v5, v9, v8 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v4, v11, v10 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v1, v1, v3 +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v2, v5, v4 ; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX12-SDAG-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-FAKE16-NEXT: v_xor_b32_e32 v0, v0, v2 ; GFX12-SDAG-FAKE16-NEXT: s_setpc_b64 s[30:31] ; ; GFX12-GISEL-LABEL: test_vector_reduce_xor_v16i16: diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll index d496634ae474f..36371aff7de7b 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll @@ -34,16 +34,13 @@ define <2 x i64> @test_add2x64(ptr %a_ptr, ptr %b_ptr) { ; CHECK-LABEL: test_add2x64: ; CHECK: ; %bb.0: ; 
CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[2:3] -; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; CHECK-NEXT: flat_load_dword v4, v[0:1] +; CHECK-NEXT: flat_load_dword v5, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 48 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v1, v5, v7 -; CHECK-NEXT: v_or_b32_e32 v0, v4, v6 +; CHECK-NEXT: v_or_b32_e32 v0, v4, v5 ; CHECK-NEXT: s_setpc_b64 s[30:31] %a = load <2 x i64>, ptr %a_ptr, !range !4, !noundef !{} %b = load <2 x i64>, ptr %b_ptr, !range !5, !noundef !{} @@ -88,16 +85,13 @@ define <3 x i64> @test_add3x64(ptr %a_ptr, ptr %b_ptr) { ; CHECK-LABEL: test_add3x64: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dwordx4 v[6:9], v[2:3] -; CHECK-NEXT: ; kill: killed $vgpr2 killed $vgpr3 -; CHECK-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; CHECK-NEXT: flat_load_dword v4, v[0:1] +; CHECK-NEXT: flat_load_dword v5, v[2:3] +; CHECK-NEXT: v_mov_b32_e32 v1, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 48 ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v1, v5, v7 -; CHECK-NEXT: v_or_b32_e32 v0, v4, v6 +; CHECK-NEXT: v_or_b32_e32 v0, v4, v5 ; CHECK-NEXT: v_mov_b32_e32 v4, 48 ; CHECK-NEXT: v_mov_b32_e32 v5, 0 ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll index b32ce6eb0acc0..7fe945ae6ed26 100644 --- a/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll +++ b/llvm/test/CodeGen/AMDGPU/waitcnt-looptest.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=amdgcn--amdhsa -mcpu=fiji -mattr=-flat-for-global -amdgpu-load-store-vectorizer=0 | FileCheck --check-prefix=GCN %s ; Check that the waitcnt insertion algorithm correctly propagates wait counts @@ -16,6 +17,166 @@ @data_reference = addrspace(1) global [100 x float] [float 0.000000e+00, float 0x3FB99999A0000000, float 0x3FC99999A0000000, float 0x3FD3333340000000, float 0x3FD99999A0000000, float 5.000000e-01, float 0x3FE3333340000000, float 0x3FE6666660000000, float 0x3FE99999A0000000, float 0x3FECCCCCC0000000, float 1.000000e+00, float 0x3FF19999A0000000, float 0x3FF3333340000000, float 0x3FF4CCCCC0000000, float 0x3FF6666660000000, float 1.500000e+00, float 0x3FF99999A0000000, float 0x3FFB333340000000, float 0x3FFCCCCCC0000000, float 0x3FFE666660000000, float 2.000000e+00, float 0x4000CCCCC0000000, float 0x40019999A0000000, float 0x4002666660000000, float 0x4003333340000000, float 2.500000e+00, float 0x4004CCCCC0000000, float 0x40059999A0000000, float 0x4006666660000000, float 0x4007333340000000, float 3.000000e+00, float 0x4008CCCCC0000000, float 0x40099999A0000000, float 0x400A666660000000, float 0x400B333340000000, float 3.500000e+00, float 0x400CCCCCC0000000, float 0x400D9999A0000000, float 0x400E666660000000, float 0x400F333340000000, float 4.000000e+00, float 0x4010666660000000, float 0x4010CCCCC0000000, float 0x4011333340000000, float 0x40119999A0000000, float 4.500000e+00, float 0x4012666660000000, float 0x4012CCCCC0000000, float 0x4013333340000000, float 
0x40139999A0000000, float 5.000000e+00, float 0x4014666660000000, float 0x4014CCCCC0000000, float 0x4015333340000000, float 0x40159999A0000000, float 5.500000e+00, float 0x4016666660000000, float 0x4016CCCCC0000000, float 0x4017333340000000, float 0x40179999A0000000, float 6.000000e+00, float 0x4018666660000000, float 0x4018CCCCC0000000, float 0x4019333340000000, float 0x40199999A0000000, float 6.500000e+00, float 0x401A666660000000, float 0x401ACCCCC0000000, float 0x401B333340000000, float 0x401B9999A0000000, float 7.000000e+00, float 0x401C666660000000, float 0x401CCCCCC0000000, float 0x401D333340000000, float 0x401D9999A0000000, float 7.500000e+00, float 0x401E666660000000, float 0x401ECCCCC0000000, float 0x401F333340000000, float 0x401F9999A0000000, float 8.000000e+00, float 0x4020333340000000, float 0x4020666660000000, float 0x40209999A0000000, float 0x4020CCCCC0000000, float 8.500000e+00, float 0x4021333340000000, float 0x4021666660000000, float 0x40219999A0000000, float 0x4021CCCCC0000000, float 9.000000e+00, float 0x4022333340000000, float 0x4022666660000000, float 0x40229999A0000000, float 0x4022CCCCC0000000, float 9.500000e+00, float 0x4023333340000000, float 0x4023666660000000, float 0x40239999A0000000, float 0x4023CCCCC0000000], align 4 define amdgpu_kernel void @testKernel(ptr addrspace(1) nocapture %arg) local_unnamed_addr #0 { +; GCN-LABEL: testKernel: +; GCN: ; %bb.0: ; %bb +; GCN-NEXT: s_add_i32 s12, s12, s17 +; GCN-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8 +; GCN-NEXT: s_getpc_b64 s[0:1] +; GCN-NEXT: s_add_u32 s0, s0, data_generic@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s1, s1, data_generic@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 +; GCN-NEXT: s_getpc_b64 s[10:11] +; GCN-NEXT: s_add_u32 s10, s10, data_reference@gotpcrel32@lo+4 +; GCN-NEXT: s_addc_u32 s11, s11, data_reference@gotpcrel32@hi+12 +; GCN-NEXT: s_load_dwordx2 s[16:17], s[10:11], 0x0 +; GCN-NEXT: s_mov_b32 s6, 1.0 +; GCN-NEXT: s_mov_b32 s7, s6 +; GCN-NEXT: v_mov_b32_e32 v1, s6 +; GCN-NEXT: s_mov_b32 s3, 0x1100f000 +; GCN-NEXT: s_mov_b32 s2, -1 +; GCN-NEXT: v_mov_b32_e32 v2, s7 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: buffer_store_dwordx2 v[1:2], off, s[0:3], 0 offset:16 +; GCN-NEXT: s_mov_b32 s18, s2 +; GCN-NEXT: s_mov_b32 s19, s3 +; GCN-NEXT: s_mov_b64 s[6:7], -1 +; GCN-NEXT: s_mov_b32 s15, 0 +; GCN-NEXT: s_mov_b64 s[2:3], 0 +; GCN-NEXT: s_mov_b32 flat_scratch_lo, s13 +; GCN-NEXT: buffer_store_dwordx2 v[1:2], off, s[16:19], 0 offset:16 +; GCN-NEXT: .LBB0_1: ; %bb18 +; GCN-NEXT: ; =>This Inner Loop Header: Depth=1 +; GCN-NEXT: s_add_u32 s10, s0, s2 +; GCN-NEXT: s_addc_u32 s11, s1, s3 +; GCN-NEXT: s_add_u32 s12, s16, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s10 +; GCN-NEXT: s_addc_u32 s13, s17, s3 +; GCN-NEXT: s_or_b32 s18, s15, 1 +; GCN-NEXT: v_mov_b32_e32 v2, s11 +; GCN-NEXT: s_ashr_i32 s19, s18, 31 +; GCN-NEXT: flat_load_dword v1, v[1:2] +; GCN-NEXT: v_mov_b32_e32 v2, s12 +; GCN-NEXT: s_lshl_b64 s[18:19], s[18:19], 2 +; GCN-NEXT: v_mov_b32_e32 v3, s13 +; GCN-NEXT: s_add_u32 s20, s0, s18 +; GCN-NEXT: flat_load_dword v4, v[2:3] +; GCN-NEXT: s_addc_u32 s21, s1, s19 +; GCN-NEXT: v_mov_b32_e32 v2, s20 +; GCN-NEXT: v_mov_b32_e32 v3, s21 +; GCN-NEXT: s_add_u32 s18, s16, s18 +; GCN-NEXT: flat_load_dword v5, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s17, s19 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_add_u32 s18, s10, 8 +; GCN-NEXT: flat_load_dword v6, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s11, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 
+; GCN-NEXT: s_add_u32 s18, s12, 8 +; GCN-NEXT: flat_load_dword v7, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s13, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_add_u32 s18, s10, 12 +; GCN-NEXT: flat_load_dword v8, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s11, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_add_u32 s18, s12, 12 +; GCN-NEXT: flat_load_dword v9, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s13, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_add_u32 s18, s10, 16 +; GCN-NEXT: flat_load_dword v10, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s11, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_add_u32 s18, s12, 16 +; GCN-NEXT: flat_load_dword v11, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s13, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_add_u32 s18, s10, 20 +; GCN-NEXT: flat_load_dword v12, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s11, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: s_add_u32 s18, s12, 20 +; GCN-NEXT: flat_load_dword v13, v[2:3] +; GCN-NEXT: s_addc_u32 s19, s13, 0 +; GCN-NEXT: v_mov_b32_e32 v2, s18 +; GCN-NEXT: v_mov_b32_e32 v3, s19 +; GCN-NEXT: flat_load_dword v2, v[2:3] +; GCN-NEXT: s_load_dwordx4 s[20:23], s[10:11], 0x18 +; GCN-NEXT: s_load_dwordx4 s[24:27], s[12:13], 0x18 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[6:7] +; GCN-NEXT: s_add_i32 s15, s15, 10 +; GCN-NEXT: s_add_u32 s2, s2, 40 +; GCN-NEXT: s_addc_u32 s3, s3, 0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v14, s24 +; GCN-NEXT: v_mov_b32_e32 v15, s25 +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, s20, v14 +; GCN-NEXT: v_mov_b32_e32 v16, s26 +; GCN-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, s21, v15 +; GCN-NEXT: v_mov_b32_e32 v17, s27 +; GCN-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, s22, v16 +; GCN-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, s23, v17 +; GCN-NEXT: v_cndmask_b32_e64 v17, 0, 1, vcc +; GCN-NEXT: s_cmpk_eq_i32 s2, 0x190 +; GCN-NEXT: s_waitcnt vmcnt(10) +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v1, v4 +; GCN-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v1, v3, v1 +; GCN-NEXT: s_waitcnt vmcnt(8) +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v5, v6 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN-NEXT: s_waitcnt vmcnt(6) +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v7, v8 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN-NEXT: s_waitcnt vmcnt(4) +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v9, v10 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN-NEXT: s_waitcnt vmcnt(2) +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v11, v12 +; GCN-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v1, v1, v3 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: v_cmp_eq_f32_e32 vcc, v13, v2 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc +; GCN-NEXT: v_and_b32_e32 v1, v1, v2 +; GCN-NEXT: v_and_b32_e32 v1, v1, v14 +; GCN-NEXT: v_and_b32_e32 v1, v1, v15 +; GCN-NEXT: v_and_b32_e32 v1, v1, v16 +; GCN-NEXT: v_and_b32_e32 v1, v1, v17 +; GCN-NEXT: v_and_b32_e32 v1, 1, v1 +; GCN-NEXT: v_cmp_eq_u32_e64 s[6:7], 1, v1 +; GCN-NEXT: s_cbranch_scc0 .LBB0_1 +; GCN-NEXT: ; %bb.2: ; %bb1 +; GCN-NEXT: s_load_dword s2, s[4:5], 0x4 +; GCN-NEXT: s_load_dword s3, s[8:9], 0x8 +; GCN-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x0 +; GCN-NEXT: 
v_mov_b32_e32 v1, 0 +; GCN-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[6:7] +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: s_and_b32 s2, s2, 0xffff +; GCN-NEXT: s_mul_i32 s14, s14, s2 +; GCN-NEXT: s_add_i32 s3, s3, s14 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s3, v0 +; GCN-NEXT: v_lshlrev_b64 v[0:1], 2, v[0:1] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_add_u32_e32 v0, vcc, s0, v0 +; GCN-NEXT: v_addc_u32_e32 v1, vcc, v3, v1, vcc +; GCN-NEXT: flat_store_dword v[0:1], v2 +; GCN-NEXT: s_endpgm bb: store <2 x float> , ptr bitcast (ptr getelementptr ([100 x float], ptr addrspacecast ([100 x float] addrspace(1)* @data_generic to ptr), i64 0, i64 4) to ptr), align 4 store <2 x float> , ptr bitcast (ptr getelementptr ([100 x float], ptr addrspacecast ([100 x float] addrspace(1)* @data_reference to ptr), i64 0, i64 4) to ptr), align 4 diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll index 4212fd3b35cd8..2be4d26c7c5e7 100644 --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -200,30 +200,38 @@ define amdgpu_kernel void @test_vcmp_vcnd_f16(ptr addrspace(1) %out, half %x) #0 define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_nle_f32_e32 vcc_lo, 1.0, v1 +; GFX1032-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1032-NEXT: v_cmp_nge_f32_e32 vcc_lo, 0, v1 -; GFX1032-NEXT: v_cmp_nle_f32_e64 s0, 1.0, v1 -; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1032-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1032-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc_lo +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_f32_sop_and: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_nle_f32_e32 vcc, 1.0, v1 +; GFX1064-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX1064-NEXT: v_cmp_nge_f32_e32 vcc, 0, v1 -; GFX1064-NEXT: v_cmp_nle_f32_e64 s[0:1], 1.0, v1 -; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX1064-NEXT: v_and_b32_e32 v1, v1, v2 +; GFX1064-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2.0, 1.0, vcc +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %lid @@ -239,30 +247,38 @@ define amdgpu_kernel void @test_vop3_cmp_f32_sop_and(ptr addrspace(1) %arg) { define amdgpu_kernel void 
@test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_gt_i32_e32 vcc_lo, 1, v1 +; GFX1032-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, 0, v1 -; GFX1032-NEXT: v_cmp_gt_i32_e64 s0, 1, v1 -; GFX1032-NEXT: s_xor_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1032-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1032-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_i32_sop_xor: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_gt_i32_e32 vcc, 1, v1 +; GFX1064-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, 0, v1 -; GFX1064-NEXT: v_cmp_gt_i32_e64 s[0:1], 1, v1 -; GFX1064-NEXT: s_xor_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX1064-NEXT: v_xor_b32_e32 v1, v1, v2 +; GFX1064-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid @@ -278,30 +294,38 @@ define amdgpu_kernel void @test_vop3_cmp_i32_sop_xor(ptr addrspace(1) %arg) { define amdgpu_kernel void @test_vop3_cmp_u32_sop_or(ptr addrspace(1) %arg) { ; GFX1032-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1032: ; %bb.0: -; GFX1032-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1032-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1032-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1032-NEXT: s_waitcnt vmcnt(0) +; GFX1032-NEXT: v_cmp_gt_u32_e32 vcc_lo, 2, v1 +; GFX1032-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1032-NEXT: v_cmp_lt_u32_e32 vcc_lo, 3, v1 -; GFX1032-NEXT: v_cmp_gt_u32_e64 s0, 2, v1 -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, s0 -; GFX1032-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo +; GFX1032-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1032-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc_lo +; GFX1032-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1032-NEXT: s_endpgm ; ; GFX1064-LABEL: test_vop3_cmp_u32_sop_or: ; GFX1064: ; %bb.0: -; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x24 +; 
GFX1064-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 ; GFX1064-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: global_load_dword v1, v0, s[2:3] +; GFX1064-NEXT: global_load_dword v1, v0, s[0:1] ; GFX1064-NEXT: s_waitcnt vmcnt(0) +; GFX1064-NEXT: v_cmp_gt_u32_e32 vcc, 2, v1 +; GFX1064-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX1064-NEXT: v_cmp_lt_u32_e32 vcc, 3, v1 -; GFX1064-NEXT: v_cmp_gt_u32_e64 s[0:1], 2, v1 -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, s[0:1] -; GFX1064-NEXT: global_store_dword v0, v1, s[2:3] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc +; GFX1064-NEXT: v_or_b32_e32 v1, v1, v2 +; GFX1064-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 2, 1, vcc +; GFX1064-NEXT: global_store_dword v0, v1, s[0:1] ; GFX1064-NEXT: s_endpgm %lid = tail call i32 @llvm.amdgcn.workitem.id.x() %gep = getelementptr inbounds i32, ptr addrspace(1) %arg, i32 %lid @@ -1473,15 +1497,19 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1032-LABEL: test_preserve_condition_undef_flag: ; GFX1032: ; %bb.0: ; %bb0 ; GFX1032-NEXT: s_clause 0x1 -; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x2c -; GFX1032-NEXT: s_load_dword s1, s[4:5], 0x24 +; GFX1032-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX1032-NEXT: s_load_dword s1, s[4:5], 0x2c ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s2, s0, 1.0 -; GFX1032-NEXT: v_cmp_nlt_f32_e64 s1, s1, 1.0 -; GFX1032-NEXT: v_cmp_ngt_f32_e64 s0, s0, 0 -; GFX1032-NEXT: s_or_b32 s1, s2, s1 -; GFX1032-NEXT: s_or_b32 s0, s1, s0 -; GFX1032-NEXT: s_and_b32 vcc_lo, exec_lo, s0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s0, s0, 1.0 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX1032-NEXT: v_cmp_nlt_f32_e64 s0, s1, 1.0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: v_cmp_ngt_f32_e64 s0, s1, 0 +; GFX1032-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX1032-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX1032-NEXT: ; %bb.1: ; %bb1 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 @@ -1493,15 +1521,19 @@ define amdgpu_kernel void @test_preserve_condition_undef_flag(float %arg, i32 %a ; GFX1064-LABEL: test_preserve_condition_undef_flag: ; GFX1064: ; %bb.0: ; %bb0 ; GFX1064-NEXT: s_clause 0x1 -; GFX1064-NEXT: s_load_dword s6, s[4:5], 0x2c -; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x24 +; GFX1064-NEXT: s_load_dword s0, s[4:5], 0x24 +; GFX1064-NEXT: s_load_dword s2, s[4:5], 0x2c ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s6, 1.0 -; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[2:3], s2, 1.0 -; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[4:5], s6, 0 -; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] -; GFX1064-NEXT: s_or_b64 s[0:1], s[0:1], s[4:5] -; GFX1064-NEXT: s_and_b64 vcc, exec, s[0:1] +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s0, 1.0 +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] +; GFX1064-NEXT: v_cmp_nlt_f32_e64 s[0:1], s2, 1.0 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: v_cmp_ngt_f32_e64 s[0:1], s2, 0 +; GFX1064-NEXT: v_or_b32_e32 v0, v1, v0 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 ; GFX1064-NEXT: s_cbranch_vccnz .LBB26_2 ; GFX1064-NEXT: ; %bb.1: 
; %bb1 ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 @@ -1540,10 +1572,12 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1032-NEXT: s_branch .LBB27_2 ; GFX1032-NEXT: .LBB27_1: ; %Flow ; GFX1032-NEXT: ; in Loop: Header=BB27_2 Depth=1 -; GFX1032-NEXT: s_xor_b32 s3, s1, -1 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s1 ; GFX1032-NEXT: s_add_i32 s2, s2, 1 -; GFX1032-NEXT: s_and_b32 s3, exec_lo, s3 -; GFX1032-NEXT: s_or_b32 s0, s3, s0 +; GFX1032-NEXT: v_not_b32_e32 v1, v1 +; GFX1032-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 +; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 ; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: s_cbranch_execz .LBB27_4 ; GFX1032-NEXT: .LBB27_2: ; %bb1 @@ -1578,10 +1612,12 @@ define amdgpu_kernel void @test_invert_true_phi_cond_break_loop(i32 %arg) #0 { ; GFX1064-NEXT: s_branch .LBB27_2 ; GFX1064-NEXT: .LBB27_1: ; %Flow ; GFX1064-NEXT: ; in Loop: Header=BB27_2 Depth=1 -; GFX1064-NEXT: s_xor_b64 s[6:7], s[2:3], -1 +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[2:3] ; GFX1064-NEXT: s_add_i32 s4, s4, 1 -; GFX1064-NEXT: s_and_b64 s[6:7], exec, s[6:7] -; GFX1064-NEXT: s_or_b64 s[0:1], s[6:7], s[0:1] +; GFX1064-NEXT: v_not_b32_e32 v1, v1 +; GFX1064-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] ; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1] ; GFX1064-NEXT: s_cbranch_execz .LBB27_4 ; GFX1064-NEXT: .LBB27_2: ; %bb1 @@ -1763,13 +1799,17 @@ define amdgpu_ps void @test_kill_i1_terminator_float() #0 { define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d) #0 { ; GFX1032-LABEL: test_kill_i1_terminator_i1: ; GFX1032: ; %bb.0: +; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v2, v3 +; GFX1032-NEXT: s_mov_b32 s0, exec_lo +; GFX1032-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX1032-NEXT: v_cmp_lt_i32_e32 vcc_lo, v0, v1 -; GFX1032-NEXT: v_cmp_lt_i32_e64 s0, v2, v3 -; GFX1032-NEXT: s_mov_b32 s1, exec_lo -; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_andn2_b32 s0, exec_lo, s0 -; GFX1032-NEXT: s_andn2_b32 s1, s1, s0 -; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s1 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX1032-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_andn2_b32 s1, exec_lo, vcc_lo +; GFX1032-NEXT: s_andn2_b32 s0, s0, s1 +; GFX1032-NEXT: s_and_b32 exec_lo, exec_lo, s0 ; GFX1032-NEXT: v_mov_b32_e32 v0, 0 ; GFX1032-NEXT: exp mrt0 off, off, off, off ; GFX1032-NEXT: s_endpgm @@ -1779,13 +1819,17 @@ define amdgpu_gs void @test_kill_i1_terminator_i1(i32 %a, i32 %b, i32 %c, i32 %d ; ; GFX1064-LABEL: test_kill_i1_terminator_i1: ; GFX1064: ; %bb.0: +; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, v2, v3 +; GFX1064-NEXT: s_mov_b64 s[0:1], exec +; GFX1064-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX1064-NEXT: v_cmp_lt_i32_e32 vcc, v0, v1 -; GFX1064-NEXT: v_cmp_lt_i32_e64 s[0:1], v2, v3 -; GFX1064-NEXT: s_mov_b64 s[2:3], exec -; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_andn2_b64 s[0:1], exec, s[0:1] -; GFX1064-NEXT: s_andn2_b64 s[2:3], s[2:3], s[0:1] -; GFX1064-NEXT: s_and_b64 exec, exec, s[2:3] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc +; GFX1064-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_andn2_b64 s[2:3], exec, vcc +; GFX1064-NEXT: s_andn2_b64 s[0:1], s[0:1], s[2:3] +; GFX1064-NEXT: s_and_b64 exec, exec, 
s[0:1] ; GFX1064-NEXT: v_mov_b32_e32 v0, 0 ; GFX1064-NEXT: exp mrt0 off, off, off, off ; GFX1064-NEXT: s_endpgm @@ -2495,11 +2539,15 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable ; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock @@ -2528,12 +2576,16 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) { ; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1 +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX1064-NEXT: s_bitset1_b32 s1, 31 ; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable ; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock @@ -2590,9 +2642,13 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1] ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable ; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock @@ -2604,15 +2660,15 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 +; GFX1064-NEXT: v_div_scale_f32 v4, vcc, v0, s2, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 ; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 -; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX1064-NEXT: v_fma_f32 v5, -v1, v3, v4 +; GFX1064-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX1064-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 ; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 ; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 @@ -2621,10 +2677,14 
@@ define amdgpu_kernel void @fcmp64(float %n, float %s) { ; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_bitset1_b32 s1, 31 ; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable ; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock @@ -2681,12 +2741,16 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo ; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX1032-NEXT: s_bitset1_b32 s0, 31 ; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable ; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock @@ -2715,12 +2779,16 @@ define amdgpu_kernel void @icmp32(i32 %n, i32 %s) { ; GFX1064-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_lshr_b32 s0, vcc_lo, 1 +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX1064-NEXT: s_bitset1_b32 s0, 31 ; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable ; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock @@ -2758,15 +2826,15 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1032-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0 +; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0 ; GFX1032-NEXT: v_rcp_f32_e32 v2, v1 ; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1032-NEXT: v_div_scale_f32 v3, vcc_lo, v0, s0, v0 -; GFX1032-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX1032-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX1032-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX1032-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX1032-NEXT: v_fma_f32 v5, -v1, v3, v4 +; GFX1032-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX1032-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX1032-NEXT: v_div_fmas_f32 v1, v1, v2, v3 ; GFX1032-NEXT: v_div_fixup_f32 v1, v1, s0, v0 ; GFX1032-NEXT: v_trunc_f32_e32 v1, v1 ; GFX1032-NEXT: v_fma_f32 v0, -v1, s0, v0 @@ -2775,10 +2843,14 @@ define amdgpu_kernel void @fcmp32(float %n, float 
%s) { ; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0 ; GFX1032-NEXT: s_bitset1_b32 s0, 31 ; GFX1032-NEXT: s_ff1_i32_b32 s0, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; GFX1032-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1032-NEXT: s_cselect_b32 s0, -1, 0 -; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0 -; GFX1032-NEXT: s_and_saveexec_b32 s1, s0 +; GFX1032-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX1032-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo ; GFX1032-NEXT: ; %bb.1: ; %if.then ; GFX1032-NEXT: ; divergent unreachable ; GFX1032-NEXT: ; %bb.2: ; %UnifiedReturnBlock @@ -2790,15 +2862,15 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0 +; GFX1064-NEXT: v_div_scale_f32 v4, vcc, v0, s2, v0 ; GFX1064-NEXT: v_rcp_f32_e32 v2, v1 ; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0 ; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2 -; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0 -; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2 -; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3 -; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2 -; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3 -; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4 +; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v2 +; GFX1064-NEXT: v_fma_f32 v5, -v1, v3, v4 +; GFX1064-NEXT: v_fmac_f32_e32 v3, v5, v2 +; GFX1064-NEXT: v_fma_f32 v1, -v1, v3, v4 +; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3 ; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0 ; GFX1064-NEXT: v_trunc_f32_e32 v1, v1 ; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0 @@ -2807,10 +2879,14 @@ define amdgpu_kernel void @fcmp32(float %n, float %s) { ; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0 ; GFX1064-NEXT: s_bitset1_b32 s0, 31 ; GFX1064-NEXT: s_ff1_i32_b32 s0, s0 +; GFX1064-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX1064-NEXT: s_cmp_gt_u32 s0, 9 ; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1] -; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], s[0:1] +; GFX1064-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX1064-NEXT: v_and_b32_e32 v0, v0, v1 +; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc ; GFX1064-NEXT: ; %bb.1: ; %if.then ; GFX1064-NEXT: ; divergent unreachable ; GFX1064-NEXT: ; %bb.2: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/wqm.ll b/llvm/test/CodeGen/AMDGPU/wqm.ll index 1ca2a8ada68ea..0ccefd4849e90 100644 --- a/llvm/test/CodeGen/AMDGPU/wqm.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm.ll @@ -3481,8 +3481,12 @@ define amdgpu_gs void @wqm_init_exec_wwm() { ; GFX9-W64-NEXT: s_cselect_b64 s[2:3], -1, 0 ; GFX9-W64-NEXT: s_cmp_lg_u64 s[0:1], 0 ; GFX9-W64-NEXT: s_cselect_b64 s[0:1], -1, 0 -; GFX9-W64-NEXT: s_xor_b64 s[0:1], s[2:3], s[0:1] -; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s[0:1] +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[2:3] +; GFX9-W64-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[0:1] +; GFX9-W64-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX9-W64-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-W64-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GFX9-W64-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc ; GFX9-W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-W64-NEXT: exp mrt0 off, off, off, off ; GFX9-W64-NEXT: s_endpgm @@ -3495,10 +3499,14 @@ define amdgpu_gs void @wqm_init_exec_wwm() { ; GFX10-W32-NEXT: s_mov_b32 s0, s1 ; GFX10-W32-NEXT: s_cselect_b32 s2, -1, 0 ; GFX10-W32-NEXT: s_cmp_lg_u64 s[0:1], 0 
-; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 ; GFX10-W32-NEXT: s_cselect_b32 s0, -1, 0 -; GFX10-W32-NEXT: s_xor_b32 s0, s2, s0 -; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-W32-NEXT: v_xor_b32_e32 v0, v0, v1 +; GFX10-W32-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-W32-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-W32-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 +; GFX10-W32-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, vcc_lo ; GFX10-W32-NEXT: exp mrt0 off, off, off, off ; GFX10-W32-NEXT: s_endpgm call void @llvm.amdgcn.init.exec(i64 0) diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 00bb7b24786f5..bd05f3fb7dc63 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefix=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefix=VI %s define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: @@ -117,16 +117,20 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; SI-NEXT: s_mov_b32 s12, s2 ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: s_mov_b32 s14, s6 -; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: buffer_load_dword v0, off, s[8:11], 0 +; SI-NEXT: s_mov_b32 s15, s7 ; SI-NEXT: buffer_load_dword v1, off, s[12:15], 0 ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v0 +; SI-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_cmp_le_f32_e64 s[0:1], 0, v1 -; SI-NEXT: s_xor_b64 vcc, s[0:1], vcc +; SI-NEXT: v_cmp_le_f32_e32 vcc, 0, v1 +; SI-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc +; SI-NEXT: v_xor_b32_e32 v2, v3, v2 +; SI-NEXT: v_and_b32_e32 v2, 1, v2 +; SI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc ; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; SI-NEXT: s_endpgm @@ -137,19 +141,23 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; VI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s4 +; VI-NEXT: v_mov_b32_e32 v2, s5 +; VI-NEXT: flat_load_dword v2, v[1:2] ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 -; VI-NEXT: flat_load_dword v4, v[0:1] -; VI-NEXT: flat_load_dword v2, v[2:3] +; VI-NEXT: flat_load_dword v3, v[0:1] ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_waitcnt vmcnt(1) -; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v4 +; VI-NEXT: v_cmp_le_f32_e32 vcc, 1.0, v2 +; VI-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cmp_le_f32_e64 s[0:1], 1.0, v2 -; VI-NEXT: s_xor_b64 vcc, vcc, s[0:1] -; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; VI-NEXT: v_cmp_le_f32_e32 vcc, 0, v3 +; VI-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc +; VI-NEXT: 
v_xor_b32_e32 v4, v5, v4 +; VI-NEXT: v_and_b32_e32 v4, 1, v4 +; VI-NEXT: v_cmp_eq_u32_e32 vcc, 1, v4 +; VI-NEXT: v_cndmask_b32_e32 v2, v2, v3, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 @@ -813,5 +821,3 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add store i64 %or, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}} diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll index fd4d325ae9374..3c2b604be17a0 100644 --- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll +++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll @@ -1,7 +1,8 @@ ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %} -; CHECK-LABEL: m2and_rr +target triple = "nvptx64-nvidia-cuda" + define i1 @m2and_rr(i1 %a, i1 %b) { ; CHECK: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}} ; CHECK-NOT: mul @@ -9,25 +10,52 @@ define i1 @m2and_rr(i1 %a, i1 %b) { ret i1 %r } -; CHECK-LABEL: m2and_ri define i1 @m2and_ri(i1 %a) { -; CHECK-NOT: mul +; CHECK-LABEL: m2and_ri( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u8 %r1, [m2and_ri_param_0]; +; CHECK-NEXT: and.b32 %r2, %r1, 1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = mul i1 %a, 1 ret i1 %r } -; CHECK-LABEL: select2or define i1 @select2or(i1 %a, i1 %b) { -; CHECK: or.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK-NOT: selp +; CHECK-LABEL: select2or( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u8 %rs1, [select2or_param_0]; +; CHECK-NEXT: ld.param.u8 %rs2, [select2or_param_1]; +; CHECK-NEXT: or.b16 %rs4, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; +; CHECK-NEXT: and.b32 %r2, %r1, 1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = select i1 %a, i1 1, i1 %b ret i1 %r } -; CHECK-LABEL: select2and define i1 @select2and(i1 %a, i1 %b) { -; CHECK: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}} -; CHECK-NOT: selp +; CHECK-LABEL: select2and( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u8 %rs1, [select2and_param_0]; +; CHECK-NEXT: ld.param.u8 %rs2, [select2and_param_1]; +; CHECK-NEXT: and.b16 %rs4, %rs1, %rs2; +; CHECK-NEXT: cvt.u32.u16 %r1, %rs4; +; CHECK-NEXT: and.b32 %r2, %r1, 1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r2; +; CHECK-NEXT: ret; %r = select i1 %a, i1 %b, i1 0 ret i1 %r } diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll new file mode 100644 index 0000000000000..36518245ffbf6 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll @@ -0,0 +1,100 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 | FileCheck %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %} + +target triple = "nvptx64-nvidia-cuda" + +define i32 @trunc(i64 %a, i64 %b) { +; CHECK-LABEL: trunc( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [trunc_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [trunc_param_1]; +; CHECK-NEXT: or.b32 %r3, %r1, %r2; 
+; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %or = or i64 %a, %b + %trunc = trunc i64 %or to i32 + ret i32 %trunc +} + +define i32 @trunc_not(i64 %a, i64 %b) { +; CHECK-LABEL: trunc_not( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<2>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [trunc_not_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [trunc_not_param_1]; +; CHECK-NEXT: or.b64 %rd3, %rd1, %rd2; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd3; +; CHECK-NEXT: mov.b64 %rd4, 0; +; CHECK-NEXT: st.u64 [%rd4], %rd3; +; CHECK-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-NEXT: ret; + %or = or i64 %a, %b + %trunc = trunc i64 %or to i32 + store i64 %or, ptr null + ret i32 %trunc +} + +define i32 @trunc_cvt(i64 %a, i64 %b) { +; CHECK-LABEL: trunc_cvt( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u32 %r1, [trunc_cvt_param_0]; +; CHECK-NEXT: ld.param.u32 %r2, [trunc_cvt_param_1]; +; CHECK-NEXT: add.s32 %r3, %r1, %r2; +; CHECK-NEXT: or.b32 %r4, %r3, %r1; +; CHECK-NEXT: st.param.b32 [func_retval0], %r4; +; CHECK-NEXT: ret; + %add = add i64 %a, %b + %or = or i64 %add, %a + %trunc = trunc i64 %or to i32 + ret i32 %trunc +} + +define i32 @trunc_cvt_not(i64 %a, i64 %b, ptr %p) { +; CHECK-LABEL: trunc_cvt_not( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-NEXT: .reg .b64 %rd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [trunc_cvt_not_param_0]; +; CHECK-NEXT: ld.param.u64 %rd2, [trunc_cvt_not_param_1]; +; CHECK-NEXT: add.s64 %rd3, %rd1, %rd2; +; CHECK-NEXT: ld.param.u64 %rd4, [trunc_cvt_not_param_2]; +; CHECK-NEXT: st.u64 [%rd4], %rd3; +; CHECK-NEXT: cvt.u32.u64 %r1, %rd3; +; CHECK-NEXT: cvt.u32.u64 %r2, %rd1; +; CHECK-NEXT: or.b32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %add = add i64 %a, %b + store i64 %add, ptr %p + %or = or i64 %add, %a + %trunc = trunc i64 %or to i32 + ret i32 %trunc +} + +define i16 @trunc_i32_to_i16_not(i32 %a, i32 %b) { +; CHECK-LABEL: trunc_i32_to_i16_not( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<4>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u16 %r1, [trunc_i32_to_i16_not_param_0]; +; CHECK-NEXT: ld.param.u16 %r2, [trunc_i32_to_i16_not_param_1]; +; CHECK-NEXT: or.b32 %r3, %r1, %r2; +; CHECK-NEXT: st.param.b32 [func_retval0], %r3; +; CHECK-NEXT: ret; + %or = or i32 %a, %b + %trunc = trunc i32 %or to i16 + ret i16 %trunc +} diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll index 9da361455a656..b22164f9c2888 100644 --- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll +++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll @@ -11,8 +11,8 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) { ; CHECK-PTX-LABEL: variadics1( ; CHECK-PTX: { -; CHECK-PTX-NEXT: .reg .b32 %r<11>; -; CHECK-PTX-NEXT: .reg .b64 %rd<11>; +; CHECK-PTX-NEXT: .reg .b32 %r<12>; +; CHECK-PTX-NEXT: .reg .b64 %rd<8>; ; CHECK-PTX-NEXT: .reg .b64 %fd<7>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry @@ -26,23 +26,21 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) 
{ ; CHECK-PTX-NEXT: add.s32 %r7, %r5, %r6; ; CHECK-PTX-NEXT: add.s64 %rd2, %rd1, 19; ; CHECK-PTX-NEXT: and.b64 %rd3, %rd2, -8; -; CHECK-PTX-NEXT: ld.u64 %rd4, [%rd3]; -; CHECK-PTX-NEXT: cvt.u64.u32 %rd5, %r7; -; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, %rd4; -; CHECK-PTX-NEXT: cvt.u32.u64 %r8, %rd6; -; CHECK-PTX-NEXT: add.s64 %rd7, %rd3, 15; -; CHECK-PTX-NEXT: and.b64 %rd8, %rd7, -8; -; CHECK-PTX-NEXT: ld.f64 %fd1, [%rd8]; -; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd2, %r8; +; CHECK-PTX-NEXT: ld.u32 %r8, [%rd3]; +; CHECK-PTX-NEXT: add.s32 %r9, %r7, %r8; +; CHECK-PTX-NEXT: add.s64 %rd4, %rd3, 15; +; CHECK-PTX-NEXT: and.b64 %rd5, %rd4, -8; +; CHECK-PTX-NEXT: ld.f64 %fd1, [%rd5]; +; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd2, %r9; ; CHECK-PTX-NEXT: add.rn.f64 %fd3, %fd2, %fd1; -; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r9, %fd3; -; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, 15; -; CHECK-PTX-NEXT: and.b64 %rd10, %rd9, -8; -; CHECK-PTX-NEXT: ld.f64 %fd4, [%rd10]; -; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd5, %r9; +; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r10, %fd3; +; CHECK-PTX-NEXT: add.s64 %rd6, %rd5, 15; +; CHECK-PTX-NEXT: and.b64 %rd7, %rd6, -8; +; CHECK-PTX-NEXT: ld.f64 %fd4, [%rd7]; +; CHECK-PTX-NEXT: cvt.rn.f64.s32 %fd5, %r10; ; CHECK-PTX-NEXT: add.rn.f64 %fd6, %fd5, %fd4; -; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r10, %fd6; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r10; +; CHECK-PTX-NEXT: cvt.rzi.s32.f64 %r11, %fd6; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r11; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 @@ -152,8 +150,8 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) { ; CHECK-PTX-NEXT: .reg .b64 %SP; ; CHECK-PTX-NEXT: .reg .b64 %SPL; ; CHECK-PTX-NEXT: .reg .b16 %rs<4>; -; CHECK-PTX-NEXT: .reg .b32 %r<7>; -; CHECK-PTX-NEXT: .reg .b64 %rd<9>; +; CHECK-PTX-NEXT: .reg .b32 %r<8>; +; CHECK-PTX-NEXT: .reg .b64 %rd<6>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: mov.b64 %SPL, __local_depot2; @@ -170,13 +168,11 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) { ; CHECK-PTX-NEXT: st.local.u8 [%rd3+1], %rs2; ; CHECK-PTX-NEXT: ld.u8 %rs3, [%rd5+5]; ; CHECK-PTX-NEXT: st.local.u8 [%rd3], %rs3; -; CHECK-PTX-NEXT: ld.u64 %rd6, [%rd5+8]; -; CHECK-PTX-NEXT: add.s32 %r4, %r1, %r2; -; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r3; -; CHECK-PTX-NEXT: cvt.u64.u32 %rd7, %r5; -; CHECK-PTX-NEXT: add.s64 %rd8, %rd7, %rd6; -; CHECK-PTX-NEXT: cvt.u32.u64 %r6, %rd8; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r6; +; CHECK-PTX-NEXT: ld.u32 %r4, [%rd5+8]; +; CHECK-PTX-NEXT: add.s32 %r5, %r1, %r2; +; CHECK-PTX-NEXT: add.s32 %r6, %r5, %r3; +; CHECK-PTX-NEXT: add.s32 %r7, %r6, %r4; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r7; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 @@ -337,20 +333,19 @@ entry: define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) 
{ ; CHECK-PTX-LABEL: variadics4( ; CHECK-PTX: { -; CHECK-PTX-NEXT: .reg .b32 %r<2>; -; CHECK-PTX-NEXT: .reg .b64 %rd<10>; +; CHECK-PTX-NEXT: .reg .b32 %r<6>; +; CHECK-PTX-NEXT: .reg .b64 %rd<5>; ; CHECK-PTX-EMPTY: ; CHECK-PTX-NEXT: // %bb.0: // %entry ; CHECK-PTX-NEXT: ld.param.u64 %rd2, [variadics4_param_1]; ; CHECK-PTX-NEXT: add.s64 %rd3, %rd2, 7; ; CHECK-PTX-NEXT: and.b64 %rd4, %rd3, -8; -; CHECK-PTX-NEXT: ld.u64 %rd5, [%rd4]; -; CHECK-PTX-NEXT: ld.param.u64 %rd6, [variadics4_param_0]; -; CHECK-PTX-NEXT: ld.param.u64 %rd7, [variadics4_param_0+8]; -; CHECK-PTX-NEXT: add.s64 %rd8, %rd6, %rd7; -; CHECK-PTX-NEXT: add.s64 %rd9, %rd8, %rd5; -; CHECK-PTX-NEXT: cvt.u32.u64 %r1, %rd9; -; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r1; +; CHECK-PTX-NEXT: ld.u32 %r1, [%rd4]; +; CHECK-PTX-NEXT: ld.param.u32 %r2, [variadics4_param_0]; +; CHECK-PTX-NEXT: ld.param.u32 %r3, [variadics4_param_0+8]; +; CHECK-PTX-NEXT: add.s32 %r4, %r2, %r3; +; CHECK-PTX-NEXT: add.s32 %r5, %r4, %r1; +; CHECK-PTX-NEXT: st.param.b32 [func_retval0], %r5; ; CHECK-PTX-NEXT: ret; entry: %vlist = alloca ptr, align 8 diff --git a/llvm/test/CodeGen/X86/2012-08-16-setcc.ll b/llvm/test/CodeGen/X86/2012-08-16-setcc.ll index 89ae5680e3ba9..f82439a7d82ee 100644 --- a/llvm/test/CodeGen/X86/2012-08-16-setcc.ll +++ b/llvm/test/CodeGen/X86/2012-08-16-setcc.ll @@ -6,8 +6,8 @@ define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) { ; CHECK-LABEL: and_1: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: xorl %eax, %eax +; CHECK-NEXT: testb %dil, %sil ; CHECK-NEXT: cmovnel %edx, %eax ; CHECK-NEXT: retq %1 = and i8 %b, %a @@ -19,7 +19,7 @@ define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) { define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) { ; CHECK-LABEL: and_2: ; CHECK: # %bb.0: -; CHECK-NEXT: testl %edi, %esi +; CHECK-NEXT: testb %dil, %sil ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %1 = and i8 %b, %a @@ -31,7 +31,7 @@ define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) { ; CHECK-LABEL: xor_1: ; CHECK: # %bb.0: ; CHECK-NEXT: xorl %eax, %eax -; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: xorb %dil, %sil ; CHECK-NEXT: cmovnel %edx, %eax ; CHECK-NEXT: retq %1 = xor i8 %b, %a @@ -43,7 +43,7 @@ define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) { define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) { ; CHECK-LABEL: xor_2: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: xorb %dil, %sil ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %1 = xor i8 %b, %a diff --git a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll index 1a8d33f5b3480..f4b54bf22d0f5 100644 --- a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll +++ b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll @@ -11,62 +11,63 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind { ; I386-NOCMOV-LABEL: t0: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: addl %ecx, %eax -; I386-NOCMOV-NEXT: addl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: addb %cl, %al +; I386-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %cl ; I386-NOCMOV-NEXT: cmpb %cl, %al ; I386-NOCMOV-NEXT: jg .LBB0_2 ; I386-NOCMOV-NEXT: # %bb.1: ; I386-NOCMOV-NEXT: movl %ecx, %eax ; I386-NOCMOV-NEXT: .LBB0_2: -; I386-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-NOCMOV-NEXT: 
retl ; ; I386-CMOV-LABEL: t0: ; I386-CMOV: # %bb.0: -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: addl %eax, %ecx -; I386-CMOV-NEXT: addl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: addb %al, %cl +; I386-CMOV-NEXT: addb {{[0-9]+}}(%esp), %al ; I386-CMOV-NEXT: cmpb %al, %cl +; I386-CMOV-NEXT: movzbl %cl, %ecx +; I386-CMOV-NEXT: movzbl %al, %eax ; I386-CMOV-NEXT: cmovgl %ecx, %eax ; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-CMOV-NEXT: retl ; ; I686-NOCMOV-LABEL: t0: ; I686-NOCMOV: # %bb.0: -; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I686-NOCMOV-NEXT: addl %ecx, %eax -; I686-NOCMOV-NEXT: addl {{[0-9]+}}(%esp), %ecx +; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I686-NOCMOV-NEXT: addb %cl, %al +; I686-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %cl ; I686-NOCMOV-NEXT: cmpb %cl, %al ; I686-NOCMOV-NEXT: jg .LBB0_2 ; I686-NOCMOV-NEXT: # %bb.1: ; I686-NOCMOV-NEXT: movl %ecx, %eax ; I686-NOCMOV-NEXT: .LBB0_2: -; I686-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I686-NOCMOV-NEXT: retl ; ; I686-CMOV-LABEL: t0: ; I686-CMOV: # %bb.0: -; I686-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I686-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I686-CMOV-NEXT: addl %eax, %ecx -; I686-CMOV-NEXT: addl {{[0-9]+}}(%esp), %eax +; I686-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I686-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I686-CMOV-NEXT: addb %al, %cl +; I686-CMOV-NEXT: addb {{[0-9]+}}(%esp), %al ; I686-CMOV-NEXT: cmpb %al, %cl +; I686-CMOV-NEXT: movzbl %cl, %ecx +; I686-CMOV-NEXT: movzbl %al, %eax ; I686-CMOV-NEXT: cmovgl %ecx, %eax ; I686-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I686-CMOV-NEXT: retl ; ; X86_64-LABEL: t0: ; X86_64: # %bb.0: -; X86_64-NEXT: # kill: def $edx killed $edx def $rdx -; X86_64-NEXT: # kill: def $esi killed $esi def $rsi -; X86_64-NEXT: # kill: def $edi killed $edi def $rdi -; X86_64-NEXT: leal (%rdi,%rdx), %ecx -; X86_64-NEXT: leal (%rsi,%rdx), %eax -; X86_64-NEXT: cmpb %al, %cl +; X86_64-NEXT: addb %dl, %dil +; X86_64-NEXT: addb %dl, %sil +; X86_64-NEXT: cmpb %sil, %dil +; X86_64-NEXT: movzbl %dil, %ecx +; X86_64-NEXT: movzbl %sil, %eax ; X86_64-NEXT: cmovgl %ecx, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: retq @@ -84,64 +85,63 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind { define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nounwind { ; I386-NOCMOV-LABEL: neg_only_one_truncation: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: addl %ecx, %eax -; I386-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %cl +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: addb %al, %cl +; I386-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %al ; I386-NOCMOV-NEXT: cmpb %cl, %al ; I386-NOCMOV-NEXT: jg .LBB1_2 ; I386-NOCMOV-NEXT: # %bb.1: ; I386-NOCMOV-NEXT: movl %ecx, %eax ; I386-NOCMOV-NEXT: .LBB1_2: -; I386-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: neg_only_one_truncation: ; I386-CMOV: # %bb.0: -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: addl %eax, %ecx +; 
I386-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: addb %al, %cl ; I386-CMOV-NEXT: addb {{[0-9]+}}(%esp), %al -; I386-CMOV-NEXT: cmpb %al, %cl -; I386-CMOV-NEXT: movzbl %al, %eax -; I386-CMOV-NEXT: cmovgl %ecx, %eax +; I386-CMOV-NEXT: cmpb %cl, %al +; I386-CMOV-NEXT: movzbl %al, %edx +; I386-CMOV-NEXT: movzbl %cl, %eax +; I386-CMOV-NEXT: cmovgl %edx, %eax ; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-CMOV-NEXT: retl ; ; I686-NOCMOV-LABEL: neg_only_one_truncation: ; I686-NOCMOV: # %bb.0: -; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I686-NOCMOV-NEXT: addl %ecx, %eax -; I686-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %cl +; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I686-NOCMOV-NEXT: addb %al, %cl +; I686-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %al ; I686-NOCMOV-NEXT: cmpb %cl, %al ; I686-NOCMOV-NEXT: jg .LBB1_2 ; I686-NOCMOV-NEXT: # %bb.1: ; I686-NOCMOV-NEXT: movl %ecx, %eax ; I686-NOCMOV-NEXT: .LBB1_2: -; I686-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I686-NOCMOV-NEXT: retl ; ; I686-CMOV-LABEL: neg_only_one_truncation: ; I686-CMOV: # %bb.0: -; I686-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I686-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I686-CMOV-NEXT: addl %eax, %ecx +; I686-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I686-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I686-CMOV-NEXT: addb %al, %cl ; I686-CMOV-NEXT: addb {{[0-9]+}}(%esp), %al -; I686-CMOV-NEXT: cmpb %al, %cl -; I686-CMOV-NEXT: movzbl %al, %eax -; I686-CMOV-NEXT: cmovgl %ecx, %eax +; I686-CMOV-NEXT: cmpb %cl, %al +; I686-CMOV-NEXT: movzbl %al, %edx +; I686-CMOV-NEXT: movzbl %cl, %eax +; I686-CMOV-NEXT: cmovgl %edx, %eax ; I686-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I686-CMOV-NEXT: retl ; ; X86_64-LABEL: neg_only_one_truncation: ; X86_64: # %bb.0: -; X86_64-NEXT: # kill: def $edx killed $edx def $rdx -; X86_64-NEXT: # kill: def $edi killed $edi def $rdi -; X86_64-NEXT: leal (%rdi,%rdx), %ecx -; X86_64-NEXT: addb %sil, %dl -; X86_64-NEXT: cmpb %dl, %cl -; X86_64-NEXT: movzbl %dl, %eax +; X86_64-NEXT: addb %dl, %sil +; X86_64-NEXT: addb %dl, %dil +; X86_64-NEXT: cmpb %sil, %dil +; X86_64-NEXT: movzbl %dil, %ecx +; X86_64-NEXT: movzbl %sil, %eax ; X86_64-NEXT: cmovgl %ecx, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: retq @@ -159,62 +159,63 @@ define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nou define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nounwind { ; I386-NOCMOV-LABEL: neg_type_mismatch: ; I386-NOCMOV: # %bb.0: -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: addl %ecx, %eax -; I386-NOCMOV-NEXT: addw {{[0-9]+}}(%esp), %cx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I386-NOCMOV-NEXT: addb %cl, %al +; I386-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %cl ; I386-NOCMOV-NEXT: cmpb %cl, %al ; I386-NOCMOV-NEXT: jg .LBB2_2 ; I386-NOCMOV-NEXT: # %bb.1: ; I386-NOCMOV-NEXT: movl %ecx, %eax ; I386-NOCMOV-NEXT: .LBB2_2: -; I386-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-NOCMOV-NEXT: retl ; ; I386-CMOV-LABEL: neg_type_mismatch: ; I386-CMOV: # %bb.0: -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: addl %eax, %ecx -; I386-CMOV-NEXT: addw 
{{[0-9]+}}(%esp), %ax +; I386-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-CMOV-NEXT: addb %al, %cl +; I386-CMOV-NEXT: addb {{[0-9]+}}(%esp), %al ; I386-CMOV-NEXT: cmpb %al, %cl +; I386-CMOV-NEXT: movzbl %cl, %ecx +; I386-CMOV-NEXT: movzbl %al, %eax ; I386-CMOV-NEXT: cmovgl %ecx, %eax ; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-CMOV-NEXT: retl ; ; I686-NOCMOV-LABEL: neg_type_mismatch: ; I686-NOCMOV: # %bb.0: -; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I686-NOCMOV-NEXT: addl %ecx, %eax -; I686-NOCMOV-NEXT: addw {{[0-9]+}}(%esp), %cx +; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I686-NOCMOV-NEXT: addb %cl, %al +; I686-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %cl ; I686-NOCMOV-NEXT: cmpb %cl, %al ; I686-NOCMOV-NEXT: jg .LBB2_2 ; I686-NOCMOV-NEXT: # %bb.1: ; I686-NOCMOV-NEXT: movl %ecx, %eax ; I686-NOCMOV-NEXT: .LBB2_2: -; I686-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; I686-NOCMOV-NEXT: retl ; ; I686-CMOV-LABEL: neg_type_mismatch: ; I686-CMOV: # %bb.0: -; I686-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I686-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I686-CMOV-NEXT: addl %eax, %ecx -; I686-CMOV-NEXT: addw {{[0-9]+}}(%esp), %ax +; I686-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I686-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I686-CMOV-NEXT: addb %al, %cl +; I686-CMOV-NEXT: addb {{[0-9]+}}(%esp), %al ; I686-CMOV-NEXT: cmpb %al, %cl +; I686-CMOV-NEXT: movzbl %cl, %ecx +; I686-CMOV-NEXT: movzbl %al, %eax ; I686-CMOV-NEXT: cmovgl %ecx, %eax ; I686-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I686-CMOV-NEXT: retl ; ; X86_64-LABEL: neg_type_mismatch: ; X86_64: # %bb.0: -; X86_64-NEXT: # kill: def $edx killed $edx def $rdx -; X86_64-NEXT: # kill: def $esi killed $esi def $rsi -; X86_64-NEXT: # kill: def $edi killed $edi def $rdi -; X86_64-NEXT: leal (%rdi,%rdx), %ecx -; X86_64-NEXT: leal (%rsi,%rdx), %eax -; X86_64-NEXT: cmpb %al, %cl +; X86_64-NEXT: addb %dl, %dil +; X86_64-NEXT: addb %dl, %sil +; X86_64-NEXT: cmpb %sil, %dil +; X86_64-NEXT: movzbl %dil, %ecx +; X86_64-NEXT: movzbl %sil, %eax ; X86_64-NEXT: cmovgl %ecx, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: retq @@ -234,8 +235,8 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw ; I386-NOCMOV-LABEL: negative_CopyFromReg: ; I386-NOCMOV: # %bb.0: ; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; I386-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-NOCMOV-NEXT: addl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I386-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %cl ; I386-NOCMOV-NEXT: cmpb %cl, %al ; I386-NOCMOV-NEXT: jg .LBB3_2 ; I386-NOCMOV-NEXT: # %bb.1: @@ -246,9 +247,10 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw ; I386-CMOV-LABEL: negative_CopyFromReg: ; I386-CMOV: # %bb.0: ; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I386-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I386-CMOV-NEXT: addl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I386-CMOV-NEXT: addb {{[0-9]+}}(%esp), %al ; I386-CMOV-NEXT: cmpb %al, %cl +; I386-CMOV-NEXT: movzbl %al, %eax ; I386-CMOV-NEXT: cmovgl %ecx, %eax ; I386-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I386-CMOV-NEXT: retl @@ -256,8 +258,8 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw ; I686-NOCMOV-LABEL: 
negative_CopyFromReg: ; I686-NOCMOV: # %bb.0: ; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax -; I686-NOCMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I686-NOCMOV-NEXT: addl {{[0-9]+}}(%esp), %ecx +; I686-NOCMOV-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; I686-NOCMOV-NEXT: addb {{[0-9]+}}(%esp), %cl ; I686-NOCMOV-NEXT: cmpb %cl, %al ; I686-NOCMOV-NEXT: jg .LBB3_2 ; I686-NOCMOV-NEXT: # %bb.1: @@ -268,19 +270,19 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw ; I686-CMOV-LABEL: negative_CopyFromReg: ; I686-CMOV: # %bb.0: ; I686-CMOV-NEXT: movl {{[0-9]+}}(%esp), %ecx -; I686-CMOV-NEXT: movl {{[0-9]+}}(%esp), %eax -; I686-CMOV-NEXT: addl {{[0-9]+}}(%esp), %eax +; I686-CMOV-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; I686-CMOV-NEXT: addb {{[0-9]+}}(%esp), %al ; I686-CMOV-NEXT: cmpb %al, %cl +; I686-CMOV-NEXT: movzbl %al, %eax ; I686-CMOV-NEXT: cmovgl %ecx, %eax ; I686-CMOV-NEXT: # kill: def $al killed $al killed $eax ; I686-CMOV-NEXT: retl ; ; X86_64-LABEL: negative_CopyFromReg: ; X86_64: # %bb.0: -; X86_64-NEXT: # kill: def $edx killed $edx def $rdx -; X86_64-NEXT: # kill: def $esi killed $esi def $rsi -; X86_64-NEXT: leal (%rsi,%rdx), %eax -; X86_64-NEXT: cmpb %al, %dil +; X86_64-NEXT: addb %dl, %sil +; X86_64-NEXT: cmpb %sil, %dil +; X86_64-NEXT: movzbl %sil, %eax ; X86_64-NEXT: cmovgl %edi, %eax ; X86_64-NEXT: # kill: def $al killed $al killed $eax ; X86_64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/add-and-not.ll b/llvm/test/CodeGen/X86/add-and-not.ll index 10e3a6bf6d533..3f1798e2f9bdd 100644 --- a/llvm/test/CodeGen/X86/add-and-not.ll +++ b/llvm/test/CodeGen/X86/add-and-not.ll @@ -13,8 +13,8 @@ define i8 @add_and_xor(i8 %x, i8 %y) { ; ; X64-LABEL: add_and_xor: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: orb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %xor = xor i8 %x, -1 @@ -79,8 +79,8 @@ define i8 @add_and_xor_commuted1(i8 %x, i8 %y) { ; ; X64-LABEL: add_and_xor_commuted1: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: orb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %xor = xor i8 %x, -1 @@ -98,8 +98,8 @@ define i8 @add_and_xor_commuted2(i8 %x, i8 %y) { ; ; X64-LABEL: add_and_xor_commuted2: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: orb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %xor = xor i8 %x, -1 @@ -117,8 +117,8 @@ define i8 @add_and_xor_commuted3(i8 %x, i8 %y) { ; ; X64-LABEL: add_and_xor_commuted3: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: orb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %xor = xor i8 %x, -1 diff --git a/llvm/test/CodeGen/X86/and-encoding.ll b/llvm/test/CodeGen/X86/and-encoding.ll index 248686ff8b7a2..64897a1d43222 100644 --- a/llvm/test/CodeGen/X86/and-encoding.ll +++ b/llvm/test/CodeGen/X86/and-encoding.ll @@ -22,7 +22,7 @@ define void @f1() nounwind { define void @f2(i16 %x, ptr%y) nounwind { ; CHECK-LABEL: f2: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01] ; CHECK-NEXT: movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %c = trunc i16 %x to i1 @@ -33,7 +33,7 @@ define void @f2(i16 %x, ptr%y) nounwind { 
define void @f3(i32 %x, ptr%y) nounwind { ; CHECK-LABEL: f3: ; CHECK: # %bb.0: -; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01] +; CHECK-NEXT: andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01] ; CHECK-NEXT: movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e] ; CHECK-NEXT: retq # encoding: [0xc3] %c = trunc i32 %x to i1 diff --git a/llvm/test/CodeGen/X86/and-with-overflow.ll b/llvm/test/CodeGen/X86/and-with-overflow.ll index a63f6cc6ea7e2..70c8317915239 100644 --- a/llvm/test/CodeGen/X86/and-with-overflow.ll +++ b/llvm/test/CodeGen/X86/and-with-overflow.ll @@ -46,8 +46,8 @@ define i8 @and_i8_rr(i8 zeroext %0, i8 zeroext %1) { ; ; X64-LABEL: and_i8_rr: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: andl %edi, %eax +; X64-NEXT: andb %dil, %sil +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll index 3379ac9dec893..6820166952237 100644 --- a/llvm/test/CodeGen/X86/apx/and.ll +++ b/llvm/test/CodeGen/X86/apx/and.ll @@ -6,14 +6,12 @@ define i8 @and8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-LABEL: and8rr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7] -; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: andb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x20,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: and8rr: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7] -; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: {nf} andb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x20,0xf7] ; NF-NEXT: retq # encoding: [0xc3] entry: %and = and i8 %a, %b diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll index 12ee5fc3404db..0fe511040426a 100644 --- a/llvm/test/CodeGen/X86/apx/or.ll +++ b/llvm/test/CodeGen/X86/apx/or.ll @@ -6,14 +6,12 @@ define i8 @or8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-LABEL: or8rr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7] -; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: orb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x08,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: or8rr: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7] -; NF-NEXT: # kill: def $al killed $al killed $eax +; NF-NEXT: {nf} orb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x08,0xf7] ; NF-NEXT: retq # encoding: [0xc3] entry: %or = or i8 %a, %b diff --git a/llvm/test/CodeGen/X86/apx/xor.ll b/llvm/test/CodeGen/X86/apx/xor.ll index e083c6580398c..50367cfa98b83 100644 --- a/llvm/test/CodeGen/X86/apx/xor.ll +++ b/llvm/test/CodeGen/X86/apx/xor.ll @@ -6,14 +6,12 @@ define i8 @xor8rr(i8 noundef %a, i8 noundef %b) { ; CHECK-LABEL: xor8rr: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7] -; CHECK-NEXT: # kill: def $al killed $al killed $eax +; CHECK-NEXT: xorb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0xf7] ; CHECK-NEXT: retq # encoding: [0xc3] ; ; NF-LABEL: xor8rr: ; NF: # %bb.0: # %entry -; NF-NEXT: {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7] -; NF-NEXT: # kill: def $al killed $al killed $eax +; 
NF-NEXT: {nf} xorb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0xf7] ; NF-NEXT: retq # encoding: [0xc3] entry: %xor = xor i8 %a, %b @@ -429,8 +427,8 @@ entry: define i1 @xorflag8rr(i8 %a, i8 %b) { ; CHECK-LABEL: xorflag8rr: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe] -; CHECK-NEXT: xorb $-1, %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf6,0xff] +; CHECK-NEXT: xorb %dil, %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0xfe] +; CHECK-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff] ; CHECK-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; CHECK-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; CHECK-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte @@ -438,8 +436,8 @@ define i1 @xorflag8rr(i8 %a, i8 %b) { ; ; NF-LABEL: xorflag8rr: ; NF: # %bb.0: -; NF-NEXT: xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe] -; NF-NEXT: xorb $-1, %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf6,0xff] +; NF-NEXT: {nf} xorb %dil, %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0xfe] +; NF-NEXT: xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff] ; NF-NEXT: sete %al # encoding: [0x0f,0x94,0xc0] ; NF-NEXT: movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A] ; NF-NEXT: # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll index e8e0ee0b7ef49..7447f6a7a4537 100644 --- a/llvm/test/CodeGen/X86/atomic-unordered.ll +++ b/llvm/test/CodeGen/X86/atomic-unordered.ll @@ -2009,7 +2009,7 @@ define i32 @split_load(ptr %p) { ; CHECK-O3-NEXT: movq (%rdi), %rax ; CHECK-O3-NEXT: movq %rax, %rcx ; CHECK-O3-NEXT: shrq $32, %rcx -; CHECK-O3-NEXT: orl %eax, %ecx +; CHECK-O3-NEXT: orb %al, %cl ; CHECK-O3-NEXT: movzbl %cl, %eax ; CHECK-O3-NEXT: retq %v = load atomic i64, ptr %p unordered, align 8 diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll index 23b46ee59154f..dadf71534ff58 100644 --- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll +++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll @@ -931,7 +931,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL-NEXT: kshiftrw $13, %k0, %k1 ; KNL-NEXT: kmovw %k1, %r13d ; KNL-NEXT: kshiftrw $14, %k0, %k1 -; KNL-NEXT: andl $1, %edx +; KNL-NEXT: andb $1, %dl ; KNL-NEXT: movb %dl, 2(%rax) ; KNL-NEXT: kmovw %k0, %edx ; KNL-NEXT: andl $1, %edx @@ -1244,7 +1244,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; SKX-NEXT: kshiftrd $13, %k0, %k1 ; SKX-NEXT: kmovd %k1, %r13d ; SKX-NEXT: kshiftrd $14, %k0, %k1 -; SKX-NEXT: andl $1, %edx +; SKX-NEXT: andb $1, %dl ; SKX-NEXT: movb %dl, 2(%rax) ; SKX-NEXT: kmovd %k0, %edx ; SKX-NEXT: andl $1, %edx @@ -1550,7 +1550,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; KNL_X32-NEXT: kshiftrw $5, %k0, %k1 ; KNL_X32-NEXT: kmovw %k1, %ecx ; KNL_X32-NEXT: kshiftrw $6, %k0, %k1 -; KNL_X32-NEXT: andl $1, %ebx +; KNL_X32-NEXT: andb $1, %bl ; KNL_X32-NEXT: movb %bl, 2(%eax) ; KNL_X32-NEXT: kmovw %k0, %ebx ; KNL_X32-NEXT: andl $1, %ebx @@ -1878,7 +1878,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind { ; FASTISEL-NEXT: kshiftrd $13, %k0, %k1 ; FASTISEL-NEXT: kmovd %k1, %r13d ; FASTISEL-NEXT: kshiftrd $14, %k0, %k1 -; FASTISEL-NEXT: andl $1, %edx +; FASTISEL-NEXT: andb $1, %dl ; FASTISEL-NEXT: movb %dl, 2(%rax) ; FASTISEL-NEXT: kmovd 
%k0, %edx ; FASTISEL-NEXT: andl $1, %edx diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll index 926af4e9957af..95de30335572e 100644 --- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll @@ -4548,17 +4548,17 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1 ; X64-NEXT: kmovw %k0, %esi ; X64-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax -; X64-NEXT: orl %ecx, %edx -; X64-NEXT: orl %esi, %eax -; X64-NEXT: orl %edx, %eax +; X64-NEXT: orb %cl, %dl +; X64-NEXT: orb %sil, %al +; X64-NEXT: orb %dl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all: ; X86: # %bb.0: -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ebx ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: .cfi_offset %ebx, -8 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vcmplesd %xmm1, %xmm0, %k0 @@ -4566,14 +4566,14 @@ define i8@test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1 ; X86-NEXT: vcmpunordsd {sae}, %xmm1, %xmm0, %k0 ; X86-NEXT: kmovw %k0, %edx ; X86-NEXT: vcmpneqsd %xmm1, %xmm0, %k0 {%k1} -; X86-NEXT: kmovw %k0, %esi +; X86-NEXT: kmovw %k0, %ebx ; X86-NEXT: vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %esi, %eax -; X86-NEXT: orl %edx, %eax +; X86-NEXT: orb %cl, %dl +; X86-NEXT: orb %bl, %al +; X86-NEXT: orb %dl, %al ; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl @@ -4625,17 +4625,17 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, ; X64-NEXT: kmovw %k0, %esi ; X64-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1} ; X64-NEXT: kmovw %k0, %eax -; X64-NEXT: andl %ecx, %edx -; X64-NEXT: andl %esi, %eax -; X64-NEXT: andl %edx, %eax +; X64-NEXT: andb %cl, %dl +; X64-NEXT: andb %sil, %al +; X64-NEXT: andb %dl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all: ; X86: # %bb.0: -; X86-NEXT: pushl %esi +; X86-NEXT: pushl %ebx ; X86-NEXT: .cfi_def_cfa_offset 8 -; X86-NEXT: .cfi_offset %esi, -8 +; X86-NEXT: .cfi_offset %ebx, -8 ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: kmovw %eax, %k1 ; X86-NEXT: vcmpless %xmm1, %xmm0, %k0 @@ -4643,14 +4643,14 @@ define i8@test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1, ; X86-NEXT: vcmpunordss {sae}, %xmm1, %xmm0, %k0 ; X86-NEXT: kmovw %k0, %edx ; X86-NEXT: vcmpneqss %xmm1, %xmm0, %k0 {%k1} -; X86-NEXT: kmovw %k0, %esi +; X86-NEXT: kmovw %k0, %ebx ; X86-NEXT: vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1} ; X86-NEXT: kmovw %k0, %eax -; X86-NEXT: andl %ecx, %edx -; X86-NEXT: andl %esi, %eax -; X86-NEXT: andl %edx, %eax +; X86-NEXT: andb %cl, %dl +; X86-NEXT: andb %bl, %al +; X86-NEXT: andb %dl, %al ; X86-NEXT: # kill: def $al killed $al killed $eax -; X86-NEXT: popl %esi +; X86-NEXT: popl %ebx ; X86-NEXT: .cfi_def_cfa_offset 4 ; X86-NEXT: retl %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4) diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll index 8aa898f3ec576..b8229d218dce6 100644 --- a/llvm/test/CodeGen/X86/avx512-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll @@ -1952,15 +1952,15 @@ declare 
void @f2(i32) #1 define void @store_i16_i1(i16 %x, ptr%y) { ; CHECK-LABEL: store_i16_i1: ; CHECK: ## %bb.0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: movb %dil, (%rsi) ; CHECK-NEXT: retq ; ; X86-LABEL: store_i16_i1: ; X86: ## %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $1, %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: andb $1, %cl ; X86-NEXT: movb %cl, (%eax) ; X86-NEXT: retl %c = trunc i16 %x to i1 @@ -1971,7 +1971,7 @@ define void @store_i16_i1(i16 %x, ptr%y) { define void @store_i8_i1(i8 %x, ptr%y) { ; CHECK-LABEL: store_i8_i1: ; CHECK: ## %bb.0: -; CHECK-NEXT: andl $1, %edi +; CHECK-NEXT: andb $1, %dil ; CHECK-NEXT: movb %dil, (%rsi) ; CHECK-NEXT: retq ; @@ -3936,7 +3936,7 @@ define i8 @test_v8i1_add(i8 %x, i8 %y) { ; CHECK-LABEL: test_v8i1_add: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: xorb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ; @@ -3956,7 +3956,7 @@ define i8 @test_v8i1_sub(i8 %x, i8 %y) { ; CHECK-LABEL: test_v8i1_sub: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: xorb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ; @@ -3976,7 +3976,7 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) { ; CHECK-LABEL: test_v8i1_mul: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: andb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ; @@ -5128,7 +5128,7 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) { ; CHECK-LABEL: test_v1i1_add: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: xorb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ; @@ -5148,7 +5148,7 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) { ; CHECK-LABEL: test_v1i1_sub: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: xorb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ; @@ -5168,7 +5168,7 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) { ; CHECK-LABEL: test_v1i1_mul: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: andl %esi, %eax +; CHECK-NEXT: andb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ; @@ -5188,15 +5188,14 @@ define <1 x i1> @uadd_sat_v1i1(<1 x i1> %x, <1 x i1> %y) nounwind { ; CHECK-LABEL: uadd_sat_v1i1: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: orb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ; ; X86-LABEL: uadd_sat_v1i1: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: ## kill: def $al killed $al killed $eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orb {{[0-9]+}}(%esp), %al ; X86-NEXT: retl %z = call <1 x i1> @llvm.uadd.sat.v1i1(<1 x i1> %x, <1 x i1> %y) ret <1 x i1> %z @@ -5257,15 +5256,14 @@ define <1 x i1> @sadd_sat_v1i1(<1 x i1> %x, <1 x i1> %y) nounwind { ; CHECK-LABEL: sadd_sat_v1i1: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: orb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq ; ; X86-LABEL: sadd_sat_v1i1: ; X86: ## %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: orl {{[0-9]+}}(%esp), %eax -; X86-NEXT: ## 
kill: def $al killed $al killed $eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: orb {{[0-9]+}}(%esp), %al ; X86-NEXT: retl %z = call <1 x i1> @llvm.sadd.sat.v1i1(<1 x i1> %x, <1 x i1> %y) ret <1 x i1> %z diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll index 721ffbe1ceb79..0233f46fd2618 100644 --- a/llvm/test/CodeGen/X86/avx512-select.ll +++ b/llvm/test/CodeGen/X86/avx512-select.ll @@ -137,8 +137,8 @@ define i8 @select05(i8 %a.0, i8 %m) { ; ; X64-LABEL: select05: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: orl %esi, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: orb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %mask = bitcast i8 %m to <8 x i1> @@ -212,8 +212,8 @@ define i8 @select06(i8 %a.0, i8 %m) { ; ; X64-LABEL: select06: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: andl %esi, %eax +; X64-NEXT: movl %esi, %eax +; X64-NEXT: andb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %mask = bitcast i8 %m to <8 x i1> diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll index 041a86aff53fb..17502f4a506f1 100644 --- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll +++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll @@ -33,7 +33,7 @@ define i8 @mand8(i8 %x, i8 %y) { ; CHECK-LABEL: mand8: ; CHECK: ## %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: orl %esi, %eax +; CHECK-NEXT: orb %sil, %al ; CHECK-NEXT: ## kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %ma = bitcast i8 %x to <8 x i1> diff --git a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll index 627a94799424c..743e2e15ba8db 100644 --- a/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/avx512fp16-intrinsics.ll @@ -697,9 +697,9 @@ define i8 @test_int_x86_avx512_mask_cmp_sh_all(<8 x half> %x0, <8 x half> %x1, i ; CHECK-NEXT: kmovd %k0, %esi ; CHECK-NEXT: vcmpnltsh {sae}, %xmm1, %xmm0, %k0 {%k1} ; CHECK-NEXT: kmovd %k0, %eax -; CHECK-NEXT: andl %ecx, %edx -; CHECK-NEXT: andl %esi, %eax -; CHECK-NEXT: andl %edx, %eax +; CHECK-NEXT: andb %cl, %dl +; CHECK-NEXT: andb %sil, %al +; CHECK-NEXT: andb %dl, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %res1 = call i8 @llvm.x86.avx512fp16.mask.cmp.sh(<8 x half> %x0, <8 x half> %x1, i32 2, i8 -1, i32 4) diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll index 1f21fcac8341d..fb3db5725d6b6 100644 --- a/llvm/test/CodeGen/X86/bit_ceil.ll +++ b/llvm/test/CodeGen/X86/bit_ceil.ll @@ -12,8 +12,8 @@ define i32 @bit_ceil_i32(i32 %x) { ; NOBMI-NEXT: leal -1(%rdi), %eax ; NOBMI-NEXT: movl $63, %ecx ; NOBMI-NEXT: bsrl %eax, %ecx -; NOBMI-NEXT: xorl $31, %ecx -; NOBMI-NEXT: negb %cl +; NOBMI-NEXT: xorb $-32, %cl +; NOBMI-NEXT: addb $33, %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax ; NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx @@ -48,8 +48,8 @@ define i32 @bit_ceil_i32_plus1(i32 noundef %x) { ; NOBMI: # %bb.0: # %entry ; NOBMI-NEXT: movl $63, %ecx ; NOBMI-NEXT: bsrl %edi, %ecx -; NOBMI-NEXT: xorl $31, %ecx -; NOBMI-NEXT: negb %cl +; NOBMI-NEXT: xorb $-32, %cl +; NOBMI-NEXT: addb $33, %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax ; NOBMI-NEXT: # kill: def $cl killed $cl killed $ecx @@ -86,8 +86,8 @@ define i64 @bit_ceil_i64(i64 %x) { ; NOBMI-NEXT: leaq -1(%rdi), %rax ; NOBMI-NEXT: movl $127, %ecx ; NOBMI-NEXT: bsrq %rax, %rcx -; NOBMI-NEXT: xorl $63, %ecx -; 
NOBMI-NEXT: negb %cl +; NOBMI-NEXT: xorb $-64, %cl +; NOBMI-NEXT: addb $65, %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax ; NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx @@ -121,8 +121,8 @@ define i64 @bit_ceil_i64_plus1(i64 noundef %x) { ; NOBMI: # %bb.0: # %entry ; NOBMI-NEXT: movl $127, %ecx ; NOBMI-NEXT: bsrq %rdi, %rcx -; NOBMI-NEXT: xorl $63, %ecx -; NOBMI-NEXT: negb %cl +; NOBMI-NEXT: xorb $-64, %cl +; NOBMI-NEXT: addb $65, %cl ; NOBMI-NEXT: movl $1, %edx ; NOBMI-NEXT: movl $1, %eax ; NOBMI-NEXT: # kill: def $cl killed $cl killed $rcx diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll index 234c7a0a500d3..d81a81ee93c6e 100644 --- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll +++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll @@ -873,7 +873,7 @@ define i8 @v4i32_concat_undef(<4 x i32> %vec) { ; SSE2-SSSE3-NEXT: pxor %xmm1, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqd %xmm0, %xmm1 ; SSE2-SSSE3-NEXT: movmskps %xmm1, %eax -; SSE2-SSSE3-NEXT: xorl $15, %eax +; SSE2-SSSE3-NEXT: xorb $15, %al ; SSE2-SSSE3-NEXT: # kill: def $al killed $al killed $eax ; SSE2-SSSE3-NEXT: retq ; @@ -882,7 +882,7 @@ define i8 @v4i32_concat_undef(<4 x i32> %vec) { ; AVX12-NEXT: vpxor %xmm1, %xmm1, %xmm1 ; AVX12-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0 ; AVX12-NEXT: vmovmskps %xmm0, %eax -; AVX12-NEXT: xorl $15, %eax +; AVX12-NEXT: xorb $15, %al ; AVX12-NEXT: # kill: def $al killed $al killed $eax ; AVX12-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll index 2922113b14ea9..347cdf3952447 100644 --- a/llvm/test/CodeGen/X86/bitselect.ll +++ b/llvm/test/CodeGen/X86/bitselect.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI ; PR46472 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m)) @@ -17,14 +17,24 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind { ; X86-NEXT: xorb %cl, %al ; X86-NEXT: retl ; -; X64-LABEL: bitselect_i8: -; X64: # %bb.0: -; X64-NEXT: andl %edx, %esi -; X64-NEXT: movl %edx, %eax -; X64-NEXT: notb %al -; X64-NEXT: andb %dil, %al -; X64-NEXT: orb %sil, %al -; X64-NEXT: retq +; X64-NOBMI-LABEL: bitselect_i8: +; X64-NOBMI: # %bb.0: +; X64-NOBMI-NEXT: movl %esi, %eax +; X64-NOBMI-NEXT: xorb %dil, %al +; X64-NOBMI-NEXT: andb %dl, %al +; X64-NOBMI-NEXT: xorb %dil, %al +; X64-NOBMI-NEXT: # kill: def $al killed $al killed $eax +; X64-NOBMI-NEXT: retq +; +; X64-BMI-LABEL: bitselect_i8: +; X64-BMI: # %bb.0: +; X64-BMI-NEXT: movl %edx, %eax +; X64-BMI-NEXT: andb %al, %sil +; X64-BMI-NEXT: notb %al +; X64-BMI-NEXT: andb %dil, %al +; X64-BMI-NEXT: orb %sil, %al +; X64-BMI-NEXT: # kill: def $al killed $al killed $eax +; X64-BMI-NEXT: retq %not = xor i8 %m, -1 %ma = and i8 %a, %not %mb = and i8 %b, %m diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll index b73af677bc6cd..dd8511f3a8152 100644 --- a/llvm/test/CodeGen/X86/bool-math.ll +++ b/llvm/test/CodeGen/X86/bool-math.ll @@ -262,19 +262,20 @@ define i8 
@low_bit_select_constants_bigger_true_narrower_result(i16 %x) { define i1 @opaque_constant(i48 %x, i48 %y) { ; X64-LABEL: opaque_constant: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: xorq %rsi, %rax +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: shrq $32, %rdi ; X64-NEXT: shrq $32, %rax -; X64-NEXT: andl $1, %eax +; X64-NEXT: xorb %dil, %al +; X64-NEXT: andb $1, %al ; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq ; ; X32-LABEL: opaque_constant: ; X32: # %bb.0: -; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: xorl {{[0-9]+}}(%esp), %eax -; X32-NEXT: andl $1, %eax -; X32-NEXT: # kill: def $al killed $al killed $eax +; X32-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X32-NEXT: movl $1, %ecx +; X32-NEXT: xorb {{[0-9]+}}(%esp), %al +; X32-NEXT: andb %cl, %al ; X32-NEXT: retl %andx = and i48 %x, 4294967296 %andy = and i48 %y, 4294967296 diff --git a/llvm/test/CodeGen/X86/cmp-bool.ll b/llvm/test/CodeGen/X86/cmp-bool.ll index 617b485e0de0f..5ba3680c1b74d 100644 --- a/llvm/test/CodeGen/X86/cmp-bool.ll +++ b/llvm/test/CodeGen/X86/cmp-bool.ll @@ -4,7 +4,7 @@ define void @bool_eq(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind { ; CHECK-LABEL: bool_eq: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: cmpb %sil, %dil ; CHECK-NEXT: je .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.end ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll index 5e030de1409f2..816f16667734c 100644 --- a/llvm/test/CodeGen/X86/cmp-concat.ll +++ b/llvm/test/CodeGen/X86/cmp-concat.ll @@ -4,7 +4,7 @@ define i1 @cmp_allbits_concat_i8(i8 %x, i8 %y) { ; CHECK-LABEL: cmp_allbits_concat_i8: ; CHECK: # %bb.0: -; CHECK-NEXT: andl %esi, %edi +; CHECK-NEXT: andb %sil, %dil ; CHECK-NEXT: cmpb $-1, %dil ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll index 752f6659948e6..653508c73771f 100644 --- a/llvm/test/CodeGen/X86/ctlo.ll +++ b/llvm/test/CodeGen/X86/ctlo.ll @@ -21,7 +21,7 @@ define i8 @ctlo_i8(i8 %x) { ; X86-NOCMOV-NEXT: # %bb.2: # %cond.false ; X86-NOCMOV-NEXT: movzbl %al, %eax ; X86-NOCMOV-NEXT: bsrl %eax, %eax -; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: xorb $7, %al ; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; X86-NOCMOV-NEXT: retl ; X86-NOCMOV-NEXT: .LBB0_1: @@ -37,7 +37,7 @@ define i8 @ctlo_i8(i8 %x) { ; X86-CMOV-NEXT: bsrl %eax, %ecx ; X86-CMOV-NEXT: movl $15, %eax ; X86-CMOV-NEXT: cmovnel %ecx, %eax -; X86-CMOV-NEXT: xorl $7, %eax +; X86-CMOV-NEXT: xorb $7, %al ; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax ; X86-CMOV-NEXT: retl ; @@ -47,24 +47,26 @@ define i8 @ctlo_i8(i8 %x) { ; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: movl $15, %eax ; X64-NEXT: bsrl %ecx, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; ; X86-CLZ-LABEL: ctlo_i8: ; X86-CLZ: # %bb.0: -; X86-CLZ-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-CLZ-NEXT: shll $24, %eax -; X86-CLZ-NEXT: notl %eax +; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-CLZ-NEXT: notb %al +; X86-CLZ-NEXT: movzbl %al, %eax ; X86-CLZ-NEXT: lzcntl %eax, %eax +; X86-CLZ-NEXT: addb $-24, %al ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; ; X64-CLZ-LABEL: ctlo_i8: ; X64-CLZ: # %bb.0: -; X64-CLZ-NEXT: shll $24, %edi -; X64-CLZ-NEXT: notl %edi -; X64-CLZ-NEXT: lzcntl %edi, %eax +; X64-CLZ-NEXT: notb %dil +; X64-CLZ-NEXT: movzbl %dil, %eax +; X64-CLZ-NEXT: lzcntl %eax, %eax +; X64-CLZ-NEXT: 
addb $-24, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq %tmp1 = xor i8 %x, -1 @@ -79,7 +81,7 @@ define i8 @ctlo_i8_undef(i8 %x) { ; X86-NEXT: notb %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax +; X86-NEXT: xorb $7, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -88,7 +90,7 @@ define i8 @ctlo_i8_undef(i8 %x) { ; X64-NEXT: notb %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll index 1267fe9033454..c1b5eae74e50f 100644 --- a/llvm/test/CodeGen/X86/ctlz.ll +++ b/llvm/test/CodeGen/X86/ctlz.ll @@ -17,7 +17,7 @@ define i8 @ctlz_i8(i8 %x) { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax +; X86-NEXT: xorb $7, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -25,7 +25,7 @@ define i8 @ctlz_i8(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -226,7 +226,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; X86-NOCMOV-NEXT: # %bb.2: # %cond.false ; X86-NOCMOV-NEXT: movzbl %al, %eax ; X86-NOCMOV-NEXT: bsrl %eax, %eax -; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: xorb $7, %al ; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; X86-NOCMOV-NEXT: retl ; X86-NOCMOV-NEXT: .LBB4_1: @@ -240,7 +240,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; X86-CMOV-NEXT: bsrl %eax, %ecx ; X86-CMOV-NEXT: movl $15, %eax ; X86-CMOV-NEXT: cmovnel %ecx, %eax -; X86-CMOV-NEXT: xorl $7, %eax +; X86-CMOV-NEXT: xorb $7, %al ; X86-CMOV-NEXT: # kill: def $al killed $al killed $eax ; X86-CMOV-NEXT: retl ; @@ -249,7 +249,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; X64-NEXT: movzbl %dil, %ecx ; X64-NEXT: movl $15, %eax ; X64-NEXT: bsrl %ecx, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -257,7 +257,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; X86-CLZ: # %bb.0: ; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: lzcntl %eax, %eax -; X86-CLZ-NEXT: addl $-24, %eax +; X86-CLZ-NEXT: addb $-24, %al ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl ; @@ -265,7 +265,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; X64-CLZ: # %bb.0: ; X64-CLZ-NEXT: movzbl %dil, %eax ; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax +; X64-CLZ-NEXT: addb $-24, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq ; @@ -273,7 +273,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; X64-FASTLZCNT: # %bb.0: ; X64-FASTLZCNT-NEXT: movzbl %dil, %eax ; X64-FASTLZCNT-NEXT: lzcntl %eax, %eax -; X64-FASTLZCNT-NEXT: addl $-24, %eax +; X64-FASTLZCNT-NEXT: addb $-24, %al ; X64-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax ; X64-FASTLZCNT-NEXT: retq ; @@ -281,7 +281,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) { ; X86-FASTLZCNT: # %bb.0: ; X86-FASTLZCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax -; X86-FASTLZCNT-NEXT: addl $-24, %eax +; X86-FASTLZCNT-NEXT: addb $-24, %al ; X86-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax ; X86-FASTLZCNT-NEXT: retl %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false) @@ -654,7 +654,7 @@ 
define i8 @ctlz_i8_knownbits(i8 %x) { ; X86-NEXT: orb $64, %al ; X86-NEXT: movzbl %al, %eax ; X86-NEXT: bsrl %eax, %eax -; X86-NEXT: xorl $7, %eax +; X86-NEXT: xorb $7, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -663,7 +663,7 @@ define i8 @ctlz_i8_knownbits(i8 %x) { ; X64-NEXT: orb $64, %dil ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: bsrl %eax, %eax -; X64-NEXT: xorl $7, %eax +; X64-NEXT: xorb $7, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -963,7 +963,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) { ; X86-NOCMOV-NEXT: # %bb.2: # %cond.false ; X86-NOCMOV-NEXT: movzbl %al, %eax ; X86-NOCMOV-NEXT: bsrl %eax, %eax -; X86-NOCMOV-NEXT: xorl $7, %eax +; X86-NOCMOV-NEXT: xorb $7, %al ; X86-NOCMOV-NEXT: xorb $7, %al ; X86-NOCMOV-NEXT: # kill: def $al killed $al killed $eax ; X86-NOCMOV-NEXT: retl @@ -994,7 +994,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) { ; X86-CLZ: # %bb.0: ; X86-CLZ-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-CLZ-NEXT: lzcntl %eax, %eax -; X86-CLZ-NEXT: addl $-24, %eax +; X86-CLZ-NEXT: addb $-24, %al ; X86-CLZ-NEXT: xorb $7, %al ; X86-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X86-CLZ-NEXT: retl @@ -1003,7 +1003,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) { ; X64-CLZ: # %bb.0: ; X64-CLZ-NEXT: movzbl %dil, %eax ; X64-CLZ-NEXT: lzcntl %eax, %eax -; X64-CLZ-NEXT: addl $-24, %eax +; X64-CLZ-NEXT: addb $-24, %al ; X64-CLZ-NEXT: xorb $7, %al ; X64-CLZ-NEXT: # kill: def $al killed $al killed $eax ; X64-CLZ-NEXT: retq @@ -1012,7 +1012,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) { ; X64-FASTLZCNT: # %bb.0: ; X64-FASTLZCNT-NEXT: movzbl %dil, %eax ; X64-FASTLZCNT-NEXT: lzcntl %eax, %eax -; X64-FASTLZCNT-NEXT: addl $-24, %eax +; X64-FASTLZCNT-NEXT: addb $-24, %al ; X64-FASTLZCNT-NEXT: xorb $7, %al ; X64-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax ; X64-FASTLZCNT-NEXT: retq @@ -1021,7 +1021,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) { ; X86-FASTLZCNT: # %bb.0: ; X86-FASTLZCNT-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-FASTLZCNT-NEXT: lzcntl %eax, %eax -; X86-FASTLZCNT-NEXT: addl $-24, %eax +; X86-FASTLZCNT-NEXT: addb $-24, %al ; X86-FASTLZCNT-NEXT: xorb $7, %al ; X86-FASTLZCNT-NEXT: # kill: def $al killed $al killed $eax ; X86-FASTLZCNT-NEXT: retl diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll index a43dba94d30c7..4fbc59e29d304 100644 --- a/llvm/test/CodeGen/X86/ctpop-mask.ll +++ b/llvm/test/CodeGen/X86/ctpop-mask.ll @@ -148,10 +148,9 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone { ; ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask3: ; X86-NO-POPCOUNT: # %bb.0: -; X86-NO-POPCOUNT-NEXT: movzwl {{[0-9]+}}(%esp), %ecx -; X86-NO-POPCOUNT-NEXT: andl $14, %ecx +; X86-NO-POPCOUNT-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NO-POPCOUNT-NEXT: andb $14, %cl ; X86-NO-POPCOUNT-NEXT: movl $59796, %eax # imm = 0xE994 -; X86-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-POPCOUNT-NEXT: shrl %cl, %eax ; X86-NO-POPCOUNT-NEXT: andl $3, %eax ; X86-NO-POPCOUNT-NEXT: # kill: def $ax killed $ax killed $eax @@ -160,7 +159,7 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone { ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask3: ; X64-NO-POPCOUNT: # %bb.0: ; X64-NO-POPCOUNT-NEXT: movl %edi, %ecx -; X64-NO-POPCOUNT-NEXT: andl $14, %ecx +; X64-NO-POPCOUNT-NEXT: andb $14, %cl ; X64-NO-POPCOUNT-NEXT: movl $59796, %eax # imm = 0xE994 ; X64-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-POPCOUNT-NEXT: shrl %cl, %eax @@ -243,7 +242,7 @@ define i32 
@ctpop_shifted_mask4(i32 %x) nounwind readnone { ; X64-NO-POPCOUNT: # %bb.0: ; X64-NO-POPCOUNT-NEXT: movl %edi, %ecx ; X64-NO-POPCOUNT-NEXT: shrl $7, %ecx -; X64-NO-POPCOUNT-NEXT: andl $60, %ecx +; X64-NO-POPCOUNT-NEXT: andb $60, %cl ; X64-NO-POPCOUNT-NEXT: movabsq $4841987667533046032, %rax # imm = 0x4332322132212110 ; X64-NO-POPCOUNT-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-POPCOUNT-NEXT: shrq %cl, %rax diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll index bf6b09674e187..fb363a96fe6ae 100644 --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -24,25 +24,25 @@ define void @_Z1nv() local_unnamed_addr { ; CHECK-LABEL: _Z1nv: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movq k@GOTPCREL(%rip), %rax -; CHECK-NEXT: movl 4(%rax), %edx +; CHECK-NEXT: movl 4(%rax), %ecx ; CHECK-NEXT: movq c@GOTPCREL(%rip), %rax -; CHECK-NEXT: movswl (%rax), %ecx +; CHECK-NEXT: movswl (%rax), %esi ; CHECK-NEXT: movq b@GOTPCREL(%rip), %rax ; CHECK-NEXT: movswl (%rax), %edi -; CHECK-NEXT: movq a@GOTPCREL(%rip), %rsi -; CHECK-NEXT: movl (%rsi), %esi +; CHECK-NEXT: movq a@GOTPCREL(%rip), %rdx +; CHECK-NEXT: movl (%rdx), %edx ; CHECK-NEXT: movq l@GOTPCREL(%rip), %r8 ; CHECK-NEXT: movl (%r8), %r8d ; CHECK-NEXT: movl %r8d, %r9d ; CHECK-NEXT: shll $7, %r9d ; CHECK-NEXT: sarl $7, %r9d ; CHECK-NEXT: negl %r9d -; CHECK-NEXT: testl %esi, %esi -; CHECK-NEXT: cmovel %esi, %r9d -; CHECK-NEXT: movzwl %dx, %r10d -; CHECK-NEXT: leal (%rcx,%r10,2), %ecx -; CHECK-NEXT: addl %edi, %ecx -; CHECK-NEXT: cmpl %r9d, %ecx +; CHECK-NEXT: testl %edx, %edx +; CHECK-NEXT: cmovel %edx, %r9d +; CHECK-NEXT: movzwl %cx, %r10d +; CHECK-NEXT: leal (%rsi,%r10,2), %esi +; CHECK-NEXT: addl %edi, %esi +; CHECK-NEXT: cmpl %r9d, %esi ; CHECK-NEXT: sete %dil ; CHECK-NEXT: testl $33554431, %r8d # imm = 0x1FFFFFF ; CHECK-NEXT: sete %r8b @@ -50,12 +50,12 @@ define void @_Z1nv() local_unnamed_addr { ; CHECK-NEXT: movzbl %r8b, %edi ; CHECK-NEXT: movq e@GOTPCREL(%rip), %r8 ; CHECK-NEXT: movw %di, (%r8) -; CHECK-NEXT: notl %ecx -; CHECK-NEXT: shrl $31, %ecx -; CHECK-NEXT: addl %edx, %ecx +; CHECK-NEXT: notl %esi +; CHECK-NEXT: shrl $31, %esi +; CHECK-NEXT: addb %sil, %cl ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx -; CHECK-NEXT: sarl %cl, %esi -; CHECK-NEXT: movw %si, (%rax) +; CHECK-NEXT: sarl %cl, %edx +; CHECK-NEXT: movw %dx, (%rax) ; CHECK-NEXT: retq entry: %bf.load = load i32, ptr getelementptr inbounds (%struct.m, ptr @k, i64 0, i32 0, i32 1), align 4 diff --git a/llvm/test/CodeGen/X86/fast-isel-fcmp.ll b/llvm/test/CodeGen/X86/fast-isel-fcmp.ll index b9ef3154cd1c3..b56e2c0faa439 100644 --- a/llvm/test/CodeGen/X86/fast-isel-fcmp.ll +++ b/llvm/test/CodeGen/X86/fast-isel-fcmp.ll @@ -9,7 +9,7 @@ define zeroext i1 @fcmp_oeq(float %x, float %y) { ; SDAG: ## %bb.0: ; SDAG-NEXT: cmpeqss %xmm1, %xmm0 ; SDAG-NEXT: movd %xmm0, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -327,7 +327,7 @@ define zeroext i1 @fcmp_une(float %x, float %y) { ; SDAG: ## %bb.0: ; SDAG-NEXT: cmpneqss %xmm1, %xmm0 ; SDAG-NEXT: movd %xmm0, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -553,7 +553,7 @@ define zeroext i1 @fcmp_oeq3(float %x) { ; SDAG-NEXT: xorps %xmm1, %xmm1 ; SDAG-NEXT: cmpeqss %xmm0, %xmm1 ; SDAG-NEXT: movd %xmm1, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb 
$1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; @@ -1165,7 +1165,7 @@ define zeroext i1 @fcmp_une3(float %x) { ; SDAG-NEXT: xorps %xmm1, %xmm1 ; SDAG-NEXT: cmpneqss %xmm0, %xmm1 ; SDAG-NEXT: movd %xmm1, %eax -; SDAG-NEXT: andl $1, %eax +; SDAG-NEXT: andb $1, %al ; SDAG-NEXT: ## kill: def $al killed $al killed $eax ; SDAG-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/fold-select.ll b/llvm/test/CodeGen/X86/fold-select.ll index 31afe979a33b3..cc9a525ba1f39 100644 --- a/llvm/test/CodeGen/X86/fold-select.ll +++ b/llvm/test/CodeGen/X86/fold-select.ll @@ -19,8 +19,8 @@ define <8 x float> @select_and_v8i1(<8 x i1> %a, <8 x i1> %b, <8 x i1> %c, <8 x define <8 x float> @select_and_v8i1_2(i8 %m1, i8 %m2, i8 %m3, <8 x float> %d) { ; CHECK-LABEL: select_and_v8i1_2: ; CHECK: # %bb.0: -; CHECK-NEXT: orl %esi, %edi -; CHECK-NEXT: andl %edx, %edi +; CHECK-NEXT: orb %sil, %dil +; CHECK-NEXT: andb %dl, %dil ; CHECK-NEXT: kmovd %edi, %k1 ; CHECK-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0] ; CHECK-NEXT: vmovaps %ymm0, %ymm1 {%k1} diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll index c79e19f07cda5..42a753cc4ee4c 100644 --- a/llvm/test/CodeGen/X86/fpenv.ll +++ b/llvm/test/CodeGen/X86/fpenv.ll @@ -185,8 +185,9 @@ define void @func_05(i32 %x) nounwind { ; X86-NOSSE-LABEL: func_05: ; X86-NOSSE: # %bb.0: ; X86-NOSSE-NEXT: pushl %eax -; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NOSSE-NEXT: leal 4(%eax,%eax), %ecx +; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NOSSE-NEXT: addl %ecx, %ecx +; X86-NOSSE-NEXT: addb $4, %cl ; X86-NOSSE-NEXT: movl $201, %eax ; X86-NOSSE-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NOSSE-NEXT: shll %cl, %eax @@ -203,8 +204,9 @@ define void @func_05(i32 %x) nounwind { ; X86-SSE-LABEL: func_05: ; X86-SSE: # %bb.0: ; X86-SSE-NEXT: pushl %eax -; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE-NEXT: leal 4(%eax,%eax), %ecx +; X86-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SSE-NEXT: addl %ecx, %ecx +; X86-SSE-NEXT: addb $4, %cl ; X86-SSE-NEXT: movl $201, %eax ; X86-SSE-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SSE-NEXT: shll %cl, %eax @@ -227,7 +229,8 @@ define void @func_05(i32 %x) nounwind { ; X64-LABEL: func_05: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal 4(%rdi,%rdi), %ecx +; X64-NEXT: leal (%rdi,%rdi), %ecx +; X64-NEXT: addb $4, %cl ; X64-NEXT: movl $201, %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shll %cl, %eax diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll index 85f4c945230e1..23069b88babec 100644 --- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll @@ -763,7 +763,7 @@ define i100 @test_signed_i100_f32(float %f) nounwind { ; X86-X87-NEXT: movl %ebx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %eax, (%ecx) -; X86-X87-NEXT: andl $15, %edx +; X86-X87-NEXT: andb $15, %dl ; X86-X87-NEXT: movb %dl, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $60, %esp @@ -816,7 +816,7 @@ define i100 @test_signed_i100_f32(float %f) nounwind { ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) -; X86-SSE-NEXT: andl $15, %eax +; X86-SSE-NEXT: andb $15, %al ; X86-SSE-NEXT: movb %al, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $44, %esp @@ -1777,7 +1777,7 @@ define i100 @test_signed_i100_f64(double %f) 
nounwind { ; X86-X87-NEXT: movl %ebx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %eax, (%ecx) -; X86-X87-NEXT: andl $15, %edx +; X86-X87-NEXT: andb $15, %dl ; X86-X87-NEXT: movb %dl, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $60, %esp @@ -1830,7 +1830,7 @@ define i100 @test_signed_i100_f64(double %f) nounwind { ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) -; X86-SSE-NEXT: andl $15, %eax +; X86-SSE-NEXT: andb $15, %al ; X86-SSE-NEXT: movb %al, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $44, %esp @@ -2957,7 +2957,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; X86-X87-NEXT: movl %ebx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %eax, (%ecx) -; X86-X87-NEXT: andl $15, %edx +; X86-X87-NEXT: andb $15, %dl ; X86-X87-NEXT: movb %dl, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $60, %esp @@ -3017,7 +3017,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind { ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) -; X86-SSE-NEXT: andl $15, %eax +; X86-SSE-NEXT: andb $15, %al ; X86-SSE-NEXT: movb %al, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $44, %esp @@ -4288,7 +4288,7 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind { ; X86-X87-NEXT: movl %ebx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %eax, (%ecx) -; X86-X87-NEXT: andl $15, %edx +; X86-X87-NEXT: andb $15, %dl ; X86-X87-NEXT: movb %dl, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $60, %esp @@ -4349,7 +4349,7 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind { ; X86-SSE-NEXT: movl %edi, 8(%esi) ; X86-SSE-NEXT: movl %edx, 4(%esi) ; X86-SSE-NEXT: movl %ecx, (%esi) -; X86-SSE-NEXT: andl $15, %eax +; X86-SSE-NEXT: andb $15, %al ; X86-SSE-NEXT: movb %al, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $44, %esp diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll index 47dc3ca3616ea..4b06fd9540891 100644 --- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll +++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll @@ -720,7 +720,7 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind { ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %edi, (%ecx) -; X86-X87-NEXT: andl $15, %eax +; X86-X87-NEXT: andb $15, %al ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $44, %esp @@ -768,7 +768,7 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind { ; X86-SSE-NEXT: movl %eax, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %edx, (%esi) -; X86-SSE-NEXT: andl $15, %ebx +; X86-SSE-NEXT: andb $15, %bl ; X86-SSE-NEXT: movb %bl, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $32, %esp @@ -1644,7 +1644,7 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind { ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %edi, (%ecx) -; X86-X87-NEXT: andl $15, %eax +; X86-X87-NEXT: andb $15, %al ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $44, %esp @@ -1692,7 +1692,7 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind { ; X86-SSE-NEXT: movl %eax, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %edx, (%esi) -; X86-SSE-NEXT: andl $15, %ebx +; X86-SSE-NEXT: andb $15, %bl ; X86-SSE-NEXT: movb %bl, 12(%esi) ; 
X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $32, %esp @@ -2723,7 +2723,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind { ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %edi, (%ecx) -; X86-X87-NEXT: andl $15, %eax +; X86-X87-NEXT: andb $15, %al ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $44, %esp @@ -2778,7 +2778,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind { ; X86-SSE-NEXT: movl %eax, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %edx, (%esi) -; X86-SSE-NEXT: andl $15, %ebx +; X86-SSE-NEXT: andb $15, %bl ; X86-SSE-NEXT: movb %bl, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $32, %esp @@ -3938,7 +3938,7 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind { ; X86-X87-NEXT: movl %edx, 8(%ecx) ; X86-X87-NEXT: movl %ebp, 4(%ecx) ; X86-X87-NEXT: movl %edi, (%ecx) -; X86-X87-NEXT: andl $15, %eax +; X86-X87-NEXT: andb $15, %al ; X86-X87-NEXT: movb %al, 12(%ecx) ; X86-X87-NEXT: movl %ecx, %eax ; X86-X87-NEXT: addl $60, %esp @@ -3992,7 +3992,7 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind { ; X86-SSE-NEXT: movl %eax, 8(%esi) ; X86-SSE-NEXT: movl %ecx, 4(%esi) ; X86-SSE-NEXT: movl %edx, (%esi) -; X86-SSE-NEXT: andl $15, %ebx +; X86-SSE-NEXT: andb $15, %bl ; X86-SSE-NEXT: movb %bl, 12(%esi) ; X86-SSE-NEXT: movl %esi, %eax ; X86-SSE-NEXT: addl $48, %esp diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll index a464d78f9af38..c9e9bda1dcd3b 100644 --- a/llvm/test/CodeGen/X86/funnel-shift.ll +++ b/llvm/test/CodeGen/X86/funnel-shift.ll @@ -182,8 +182,9 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind { ; X64-AVX-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5 ; X64-AVX-NEXT: mulq %rdx ; X64-AVX-NEXT: leal (%rdx,%rdx,8), %eax -; X64-AVX-NEXT: leal (%rdx,%rax,4), %eax -; X64-AVX-NEXT: subl %eax, %ecx +; X64-AVX-NEXT: shll $2, %eax +; X64-AVX-NEXT: addb %dl, %al +; X64-AVX-NEXT: subb %al, %cl ; X64-AVX-NEXT: shlq $27, %rsi ; X64-AVX-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-AVX-NEXT: shldq %cl, %rsi, %rdi @@ -349,9 +350,10 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind { ; X64-AVX-NEXT: movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5 ; X64-AVX-NEXT: mulq %rdx ; X64-AVX-NEXT: leal (%rdx,%rdx,8), %eax -; X64-AVX-NEXT: leal (%rdx,%rax,4), %eax -; X64-AVX-NEXT: subl %eax, %ecx -; X64-AVX-NEXT: addl $27, %ecx +; X64-AVX-NEXT: shll $2, %eax +; X64-AVX-NEXT: addb %dl, %al +; X64-AVX-NEXT: subb %al, %cl +; X64-AVX-NEXT: addb $27, %cl ; X64-AVX-NEXT: shlq $27, %rsi ; X64-AVX-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-AVX-NEXT: shrdq %cl, %rdi, %rsi @@ -437,16 +439,15 @@ define i32 @fshl_i32_undef0_msk(i32 %a0, i32 %a1) nounwind { ; X86-SSE2-LABEL: fshl_i32_undef0_msk: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: andl $7, %ecx -; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: andb $7, %cl ; X86-SSE2-NEXT: shldl %cl, %eax, %eax ; X86-SSE2-NEXT: retl ; ; X64-AVX-LABEL: fshl_i32_undef0_msk: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: movl %esi, %ecx -; X64-AVX-NEXT: andl $7, %ecx +; X64-AVX-NEXT: andb $7, %cl ; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX-NEXT: shldl %cl, %edi, %eax ; X64-AVX-NEXT: retq @@ -694,16 +695,15 @@ define i32 @fshr_i32_undef1_msk(i32 %a0, i32 %a1) 
nounwind { ; X86-SSE2-LABEL: fshr_i32_undef1_msk: ; X86-SSE2: # %bb.0: ; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SSE2-NEXT: andl $7, %ecx -; X86-SSE2-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SSE2-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-SSE2-NEXT: andb $7, %cl ; X86-SSE2-NEXT: shrdl %cl, %eax, %eax ; X86-SSE2-NEXT: retl ; ; X64-AVX-LABEL: fshr_i32_undef1_msk: ; X64-AVX: # %bb.0: ; X64-AVX-NEXT: movl %esi, %ecx -; X64-AVX-NEXT: andl $7, %ecx +; X64-AVX-NEXT: andb $7, %cl ; X64-AVX-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-AVX-NEXT: shrdl %cl, %edi, %eax ; X64-AVX-NEXT: retq diff --git a/llvm/test/CodeGen/X86/isel-and.ll b/llvm/test/CodeGen/X86/isel-and.ll index 3fda0e1d86391..fc1356c237328 100644 --- a/llvm/test/CodeGen/X86/isel-and.ll +++ b/llvm/test/CodeGen/X86/isel-and.ll @@ -30,7 +30,7 @@ define i1 @and_i1(i1 %a, i1 %b) { ; SDAG-X64-LABEL: and_i1: ; SDAG-X64: # %bb.0: ; SDAG-X64-NEXT: movl %edi, %eax -; SDAG-X64-NEXT: andl %esi, %eax +; SDAG-X64-NEXT: andb %sil, %al ; SDAG-X64-NEXT: # kill: def $al killed $al killed $eax ; SDAG-X64-NEXT: retq ; @@ -74,7 +74,7 @@ define i8 @and_i8(i8 %a, i8 %b) { ; SDAG-X64-LABEL: and_i8: ; SDAG-X64: # %bb.0: ; SDAG-X64-NEXT: movl %edi, %eax -; SDAG-X64-NEXT: andl %esi, %eax +; SDAG-X64-NEXT: andb %sil, %al ; SDAG-X64-NEXT: # kill: def $al killed $al killed $eax ; SDAG-X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/isel-fcmp.ll b/llvm/test/CodeGen/X86/isel-fcmp.ll index 4a223aaa4149b..fe969e7b852c1 100644 --- a/llvm/test/CodeGen/X86/isel-fcmp.ll +++ b/llvm/test/CodeGen/X86/isel-fcmp.ll @@ -13,7 +13,7 @@ ; X64: ## %bb.0: ; X64-NEXT: cmpeqss %xmm1, %xmm0 ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: ## kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -418,7 +418,7 @@ ; X64: ## %bb.0: ; X64-NEXT: cmpneqss %xmm1, %xmm0 ; X64-NEXT: movd %xmm0, %eax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: ## kill: def $al killed $al killed $eax ; X64-NEXT: retq ; @@ -451,7 +451,7 @@ ; X64: ## %bb.0: ; X64-NEXT: cmpeqsd %xmm1, %xmm0 ; X64-NEXT: movq %xmm0, %rax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: ## kill: def $al killed $al killed $rax ; X64-NEXT: retq ; @@ -856,7 +856,7 @@ ; X64: ## %bb.0: ; X64-NEXT: cmpneqsd %xmm1, %xmm0 ; X64-NEXT: movq %xmm0, %rax -; X64-NEXT: andl $1, %eax +; X64-NEXT: andb $1, %al ; X64-NEXT: ## kill: def $al killed $al killed $rax ; X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/isel-or.ll b/llvm/test/CodeGen/X86/isel-or.ll index 449f29a027743..ce76f798cda34 100644 --- a/llvm/test/CodeGen/X86/isel-or.ll +++ b/llvm/test/CodeGen/X86/isel-or.ll @@ -30,7 +30,7 @@ define i1 @or_i1(i1 %a, i1 %b) { ; SDAG-X64-LABEL: or_i1: ; SDAG-X64: # %bb.0: ; SDAG-X64-NEXT: movl %edi, %eax -; SDAG-X64-NEXT: orl %esi, %eax +; SDAG-X64-NEXT: orb %sil, %al ; SDAG-X64-NEXT: # kill: def $al killed $al killed $eax ; SDAG-X64-NEXT: retq ; @@ -75,7 +75,7 @@ define i8 @or_i8(i8 %a, i8 %b) { ; SDAG-X64-LABEL: or_i8: ; SDAG-X64: # %bb.0: ; SDAG-X64-NEXT: movl %edi, %eax -; SDAG-X64-NEXT: orl %esi, %eax +; SDAG-X64-NEXT: orb %sil, %al ; SDAG-X64-NEXT: # kill: def $al killed $al killed $eax ; SDAG-X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/isel-xor.ll b/llvm/test/CodeGen/X86/isel-xor.ll index a31ad78524ee1..73383b0c38841 100644 --- a/llvm/test/CodeGen/X86/isel-xor.ll +++ b/llvm/test/CodeGen/X86/isel-xor.ll @@ -30,7 +30,7 @@ define i1 @xor_i1(i1 %a, i1 %b) { ; SDAG-X64-LABEL: 
xor_i1: ; SDAG-X64: # %bb.0: ; SDAG-X64-NEXT: movl %edi, %eax -; SDAG-X64-NEXT: xorl %esi, %eax +; SDAG-X64-NEXT: xorb %sil, %al ; SDAG-X64-NEXT: # kill: def $al killed $al killed $eax ; SDAG-X64-NEXT: retq ; @@ -75,7 +75,7 @@ define i8 @xor_i8(i8 %a, i8 %b) { ; SDAG-X64-LABEL: xor_i8: ; SDAG-X64: # %bb.0: ; SDAG-X64-NEXT: movl %edi, %eax -; SDAG-X64-NEXT: xorl %esi, %eax +; SDAG-X64-NEXT: xorb %sil, %al ; SDAG-X64-NEXT: # kill: def $al killed $al killed $eax ; SDAG-X64-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll index d28a93ec3d77c..1e920d5223bb3 100644 --- a/llvm/test/CodeGen/X86/jump_sign.ll +++ b/llvm/test/CodeGen/X86/jump_sign.ll @@ -383,11 +383,10 @@ define i32 @func_test1(i32 %p1) nounwind uwtable { ; CHECK-LABEL: func_test1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl b, %eax -; CHECK-NEXT: xorl %ecx, %ecx ; CHECK-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; CHECK-NEXT: setb %cl ; CHECK-NEXT: movl a, %eax -; CHECK-NEXT: testl %eax, %ecx +; CHECK-NEXT: testb %al, %cl ; CHECK-NEXT: je .LBB18_2 ; CHECK-NEXT: # %bb.1: # %if.then ; CHECK-NEXT: decl %eax diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll index e183bbc15617d..3220a95a6134c 100644 --- a/llvm/test/CodeGen/X86/known-pow2.ll +++ b/llvm/test/CodeGen/X86/known-pow2.ll @@ -104,7 +104,7 @@ define i1 @pow2_srl(i32 %x, i32 %y) { ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shll %cl, %eax ; CHECK-NEXT: shrl $20, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %yy = and i32 %y, 7 diff --git a/llvm/test/CodeGen/X86/large-pic-string.ll b/llvm/test/CodeGen/X86/large-pic-string.ll index 5e7cdbb93dc88..2a2c46e501175 100644 --- a/llvm/test/CodeGen/X86/large-pic-string.ll +++ b/llvm/test/CodeGen/X86/large-pic-string.ll @@ -12,7 +12,7 @@ define void @pr38385() { ; CHECK-NEXT: movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx ; CHECK-NEXT: addq %rax, %rcx ; CHECK-NEXT: movabsq $.L.str@GOTOFF, %rax -; CHECK-NEXT: addl %eax, %ecx +; CHECK-NEXT: addb %al, %cl ; CHECK-NEXT: movb %cl, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: retq %p = alloca i8, align 1 diff --git a/llvm/test/CodeGen/X86/logic-shift.ll b/llvm/test/CodeGen/X86/logic-shift.ll index 96e63d1122ec9..7eac8534d06b2 100644 --- a/llvm/test/CodeGen/X86/logic-shift.ll +++ b/llvm/test/CodeGen/X86/logic-shift.ll @@ -6,7 +6,7 @@ define i8 @or_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: orb %sil, %dil ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrb %cl, %dil ; CHECK-NEXT: orb %dil, %al @@ -178,7 +178,7 @@ define i8 @or_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: orl %esi, %edi +; CHECK-NEXT: orb %sil, %dil ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlb %cl, %dil ; CHECK-NEXT: orb %dil, %al @@ -290,7 +290,7 @@ define i8 @xor_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: xorb %sil, %dil ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrb %cl, %dil ; CHECK-NEXT: xorb %dil, %al @@ -462,7 +462,7 @@ define i8 @xor_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl 
%edx, %ecx -; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: xorb %sil, %dil ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlb %cl, %dil ; CHECK-NEXT: xorb %dil, %al @@ -574,7 +574,7 @@ define i8 @and_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: andl %esi, %edi +; CHECK-NEXT: andb %sil, %dil ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shrb %cl, %dil ; CHECK-NEXT: andb %dil, %al @@ -746,7 +746,7 @@ define i8 @and_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) { ; CHECK: # %bb.0: ; CHECK-NEXT: movl %ecx, %eax ; CHECK-NEXT: movl %edx, %ecx -; CHECK-NEXT: andl %esi, %edi +; CHECK-NEXT: andb %sil, %dil ; CHECK-NEXT: # kill: def $cl killed $cl killed $ecx ; CHECK-NEXT: shlb %cl, %dil ; CHECK-NEXT: andb %dil, %al @@ -902,9 +902,8 @@ define i8 @or_fshl_commute3(i8 %x, i8 %y) { ; CHECK-LABEL: or_fshl_commute3: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: orl %edi, %esi ; CHECK-NEXT: shlb $5, %sil -; CHECK-NEXT: shrb $3, %al +; CHECK-NEXT: rolb $5, %al ; CHECK-NEXT: orb %sil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq @@ -979,10 +978,9 @@ define i16 @or_fshr_commute2(i16 %x, i16 %y) { define i8 @or_fshr_commute3(i8 %x, i8 %y) { ; CHECK-LABEL: or_fshr_commute3: ; CHECK: # %bb.0: -; CHECK-NEXT: # kill: def $edi killed $edi def $rdi -; CHECK-NEXT: orl %edi, %esi +; CHECK-NEXT: movl %edi, %eax ; CHECK-NEXT: shrb $6, %sil -; CHECK-NEXT: leal (,%rdi,4), %eax +; CHECK-NEXT: rolb $2, %al ; CHECK-NEXT: orb %sil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/lzcnt.ll b/llvm/test/CodeGen/X86/lzcnt.ll index b000401973416..3cdb9eae35470 100644 --- a/llvm/test/CodeGen/X86/lzcnt.ll +++ b/llvm/test/CodeGen/X86/lzcnt.ll @@ -13,7 +13,7 @@ define i8 @t1(i8 %x) nounwind { ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: lzcntl %eax, %eax -; X86-NEXT: addl $-24, %eax +; X86-NEXT: addb $-24, %al ; X86-NEXT: # kill: def $al killed $al killed $eax ; X86-NEXT: retl ; @@ -21,7 +21,7 @@ define i8 @t1(i8 %x) nounwind { ; X32: # %bb.0: ; X32-NEXT: movzbl %dil, %eax ; X32-NEXT: lzcntl %eax, %eax -; X32-NEXT: addl $-24, %eax +; X32-NEXT: addb $-24, %al ; X32-NEXT: # kill: def $al killed $al killed $eax ; X32-NEXT: retq ; @@ -29,7 +29,7 @@ define i8 @t1(i8 %x) nounwind { ; X64: # %bb.0: ; X64-NEXT: movzbl %dil, %eax ; X64-NEXT: lzcntl %eax, %eax -; X64-NEXT: addl $-24, %eax +; X64-NEXT: addb $-24, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 false ) diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index 2f0d419132492..4e499ad707389 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -18,7 +18,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB0_1 ; SSE2-NEXT: # %bb.2: # %else @@ -91,7 +91,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne 
.LBB0_1 ; SSE4-NEXT: # %bb.2: # %else @@ -222,7 +222,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %else @@ -303,7 +303,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB1_1 ; SSE4-NEXT: # %bb.2: # %else @@ -377,7 +377,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -448,7 +448,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB1_1 ; AVX2-NEXT: # %bb.2: # %else @@ -611,7 +611,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB2_1 @@ -687,7 +687,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB2_1 ; SSE4-NEXT: # %bb.2: # %else @@ -762,7 +762,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -835,7 +835,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %else @@ -987,7 +987,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1026,7 +1026,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE4-NEXT: movmskps %xmm3, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB3_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1126,7 +1126,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, 
ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm3, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1169,7 +1169,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packusdw %xmm3, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB4_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1209,7 +1209,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1251,7 +1251,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1357,7 +1357,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packuswb %xmm3, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB5_1 @@ -1400,7 +1400,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE4-NEXT: movmskps %xmm3, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1440,7 +1440,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB5_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1482,7 +1482,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1585,7 +1585,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1608,7 +1608,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: movmskpd %xmm2, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb 
$1, %al ; SSE4-NEXT: jne .LBB6_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1689,7 +1689,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1714,7 +1714,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: movmskpd %xmm2, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB7_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1737,7 +1737,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB7_1 ; AVX-NEXT: # %bb.2: # %else @@ -1812,7 +1812,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB8_1 @@ -1835,7 +1835,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm2 ; SSE4-NEXT: movmskpd %xmm2, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB8_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1857,7 +1857,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.2: # %else @@ -3280,7 +3280,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB11_1 ; SSE2-NEXT: # %bb.2: # %else @@ -3357,7 +3357,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm3, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB11_1 ; SSE4-NEXT: # %bb.2: # %else @@ -3426,7 +3426,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3493,7 +3493,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3] ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: 
testb $1, %al ; AVX2-NEXT: jne .LBB11_1 ; AVX2-NEXT: # %bb.2: # %else @@ -3657,7 +3657,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB12_1 @@ -3729,7 +3729,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm3, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB12_1 ; SSE4-NEXT: # %bb.2: # %else @@ -3800,7 +3800,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else @@ -3870,7 +3870,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB12_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4029,7 +4029,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB13_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4069,7 +4069,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB13_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4105,7 +4105,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskps %xmm1, %eax -; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: xorb $15, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB13_1 ; AVX-NEXT: # %bb.2: # %else @@ -4203,7 +4203,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packuswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB14_1 @@ -4243,7 +4243,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB14_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4279,7 +4279,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: 
vmovmskps %xmm1, %eax -; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: xorb $15, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB14_1 ; AVX-NEXT: # %bb.2: # %else @@ -6209,7 +6209,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB17_1 @@ -6275,7 +6275,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE4-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB17_1 ; SSE4-NEXT: # %bb.2: # %else @@ -6340,7 +6340,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpmovmskb %xmm1, %eax -; AVX-NEXT: notl %eax +; AVX-NEXT: notb %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB17_1 ; AVX-NEXT: # %bb.2: # %else diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index c950ce64e8883..67632ec389063 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -114,7 +114,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB0_2 ; SSE2-NEXT: # %bb.1: # %cond.store @@ -221,7 +221,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB0_1 ; SSE4-NEXT: # %bb.2: # %else @@ -494,7 +494,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %else @@ -604,7 +604,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB1_1 ; SSE4-NEXT: # %bb.2: # %else @@ -693,7 +693,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -772,7 +772,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB1_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1042,7 +1042,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; 
SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB2_1 @@ -1146,7 +1146,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB2_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1236,7 +1236,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1316,7 +1316,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1530,7 +1530,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2] ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1586,7 +1586,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: movmskps %xmm4, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB3_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1762,7 +1762,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1820,7 +1820,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: movmskps %xmm4, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB4_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1868,7 +1868,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1914,7 +1914,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_1 ; AVX2-NEXT: # %bb.2: # %else @@ -2077,7 +2077,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; 
SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm4, %eax ; SSE2-NEXT: jne .LBB5_1 @@ -2136,7 +2136,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packsswb %xmm1, %xmm1 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm4 ; SSE4-NEXT: movmskps %xmm4, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2185,7 +2185,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB5_1 ; AVX1-NEXT: # %bb.2: # %else @@ -2232,7 +2232,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %else @@ -2367,7 +2367,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: movmskpd %xmm2, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 ; SSE2-NEXT: # %bb.2: # %else @@ -2399,7 +2399,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB6_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2528,7 +2528,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: movmskpd %xmm2, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %else @@ -2562,7 +2562,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB7_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2591,7 +2591,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB7_1 ; AVX-NEXT: # %bb.2: # %else @@ -2697,7 +2697,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm3, %ecx ; SSE2-NEXT: jne .LBB8_1 @@ -2729,7 +2729,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pcmpeqq 
%xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB8_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2757,7 +2757,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.2: # %else @@ -4173,7 +4173,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB11_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4248,7 +4248,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm3, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB11_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4316,7 +4316,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4383,7 +4383,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB11_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4550,7 +4550,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB12_1 @@ -4619,7 +4619,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm3, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB12_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4688,7 +4688,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4756,7 +4756,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB12_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4921,7 +4921,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, 
%al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB13_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4961,7 +4961,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB13_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4997,7 +4997,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX-NEXT: vpackssdw %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskps %xmm1, %eax -; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: xorb $15, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB13_1 ; AVX-NEXT: # %bb.2: # %else @@ -5100,7 +5100,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm0, %eax ; SSE2-NEXT: jne .LBB14_1 @@ -5141,7 +5141,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packsswb %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB14_1 ; SSE4-NEXT: # %bb.2: # %else @@ -5178,7 +5178,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX-NEXT: vpacksswb %xmm0, %xmm0, %xmm0 ; AVX-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskps %xmm1, %eax -; AVX-NEXT: xorl $15, %eax +; AVX-NEXT: xorb $15, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB14_1 ; AVX-NEXT: # %bb.2: # %else @@ -7104,7 +7104,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB17_1 @@ -7170,7 +7170,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE4-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB17_1 ; SSE4-NEXT: # %bb.2: # %else @@ -7235,7 +7235,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpmovmskb %xmm1, %eax -; AVX-NEXT: notl %eax +; AVX-NEXT: notb %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB17_1 ; AVX-NEXT: # %bb.2: # %else diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index da057dd084b36..ff0e77086962f 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -56,7 +56,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB0_2 ; SSE2-NEXT: # %bb.1: # %cond.store @@ -156,7 +156,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; 
SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB0_1 ; SSE4-NEXT: # %bb.2: # %else @@ -367,7 +367,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # %bb.2: # %else @@ -470,7 +470,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB1_1 ; SSE4-NEXT: # %bb.2: # %else @@ -559,7 +559,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB1_1 ; AVX1-NEXT: # %bb.2: # %else @@ -637,7 +637,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3] ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB1_1 ; AVX2-NEXT: # %bb.2: # %else @@ -845,7 +845,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm5, %xmm4 ; SSE2-NEXT: packsswb %xmm4, %xmm4 ; SSE2-NEXT: pmovmskb %xmm4, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm1, %ecx ; SSE2-NEXT: jne .LBB2_1 @@ -942,7 +942,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm5, %xmm4 ; SSE4-NEXT: packsswb %xmm4, %xmm4 ; SSE4-NEXT: pmovmskb %xmm4, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB2_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1032,7 +1032,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB2_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1111,7 +1111,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm3, %ymm2, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1291,7 +1291,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2] ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB3_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1344,7 +1344,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm8[0,2] ; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE4-NEXT: movmskps %xmm6, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb 
$15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB3_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1489,7 +1489,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm3, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm3, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB4_1 ; SSE2-NEXT: # %bb.2: # %else @@ -1544,7 +1544,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packusdw %xmm5, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE4-NEXT: movmskps %xmm6, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB4_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1594,7 +1594,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB4_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1640,7 +1640,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB4_1 ; AVX2-NEXT: # %bb.2: # %else @@ -1770,7 +1770,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packuswb %xmm1, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm2, %xmm3 ; SSE2-NEXT: movmskps %xmm3, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: jne .LBB5_1 @@ -1826,7 +1826,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packuswb %xmm5, %xmm5 ; SSE4-NEXT: pcmpeqd %xmm2, %xmm6 ; SSE4-NEXT: movmskps %xmm6, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB5_1 ; SSE4-NEXT: # %bb.2: # %else @@ -1877,7 +1877,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB5_1 ; AVX1-NEXT: # %bb.2: # %else @@ -1924,7 +1924,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB5_1 ; AVX2-NEXT: # %bb.2: # %else @@ -2041,7 +2041,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB6_1 ; SSE2-NEXT: # %bb.2: # %else @@ -2069,7 +2069,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne 
.LBB6_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2172,7 +2172,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB7_1 ; SSE2-NEXT: # %bb.2: # %else @@ -2202,7 +2202,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB7_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2228,7 +2228,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB7_1 ; AVX-NEXT: # %bb.2: # %else @@ -2315,7 +2315,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movmskpd %xmm0, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm3, %ecx ; SSE2-NEXT: jne .LBB8_1 @@ -2343,7 +2343,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; SSE4-NEXT: pcmpeqq %xmm1, %xmm3 ; SSE4-NEXT: movmskpd %xmm3, %eax -; SSE4-NEXT: xorl $3, %eax +; SSE4-NEXT: xorb $3, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB8_1 ; SSE4-NEXT: # %bb.2: # %else @@ -2368,7 +2368,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) { ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vmovmskpd %xmm1, %eax -; AVX-NEXT: xorl $3, %eax +; AVX-NEXT: xorb $3, %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB8_1 ; AVX-NEXT: # %bb.2: # %else @@ -3867,7 +3867,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB11_1 ; SSE2-NEXT: # %bb.2: # %else @@ -3945,7 +3945,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm3, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB11_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4016,7 +4016,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB11_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4085,7 +4085,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne 
.LBB11_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4264,7 +4264,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE2-NEXT: packssdw %xmm3, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm4, %ecx ; SSE2-NEXT: jne .LBB12_1 @@ -4336,7 +4336,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; SSE4-NEXT: packssdw %xmm3, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB12_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4408,7 +4408,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1 ; AVX1-NEXT: vmovmskps %ymm1, %eax -; AVX1-NEXT: notl %eax +; AVX1-NEXT: notb %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB12_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4478,7 +4478,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) { ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vmovmskps %ymm1, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB12_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4647,7 +4647,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %eax -; SSE2-NEXT: xorl $15, %eax +; SSE2-NEXT: xorb $15, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: jne .LBB13_1 ; SSE2-NEXT: # %bb.2: # %else @@ -4688,7 +4688,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packusdw %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB13_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4725,7 +4725,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB13_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4763,7 +4763,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB13_1 ; AVX2-NEXT: # %bb.2: # %else @@ -4872,7 +4872,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE2-NEXT: packuswb %xmm3, %xmm3 ; SSE2-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE2-NEXT: movmskps %xmm2, %ecx -; SSE2-NEXT: xorl $15, %ecx +; SSE2-NEXT: xorb $15, %cl ; SSE2-NEXT: testb $1, %cl ; SSE2-NEXT: movd %xmm3, %eax ; SSE2-NEXT: jne .LBB14_1 @@ -4914,7 +4914,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; SSE4-NEXT: packuswb %xmm0, %xmm0 ; SSE4-NEXT: pcmpeqd %xmm1, %xmm2 ; SSE4-NEXT: movmskps %xmm2, %eax -; SSE4-NEXT: xorl $15, %eax +; SSE4-NEXT: xorb $15, %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB14_1 ; SSE4-NEXT: # %bb.2: # %else @@ -4952,7 +4952,7 @@ define void 
@truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vmovmskps %xmm1, %eax -; AVX1-NEXT: xorl $15, %eax +; AVX1-NEXT: xorb $15, %al ; AVX1-NEXT: testb $1, %al ; AVX1-NEXT: jne .LBB14_1 ; AVX1-NEXT: # %bb.2: # %else @@ -4991,7 +4991,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) { ; AVX2-NEXT: vpackuswb %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1 ; AVX2-NEXT: vmovmskps %xmm1, %eax -; AVX2-NEXT: xorl $15, %eax +; AVX2-NEXT: xorb $15, %al ; AVX2-NEXT: testb $1, %al ; AVX2-NEXT: jne .LBB14_1 ; AVX2-NEXT: # %bb.2: # %else @@ -6952,7 +6952,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE2-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE2-NEXT: packsswb %xmm2, %xmm2 ; SSE2-NEXT: pmovmskb %xmm2, %eax -; SSE2-NEXT: notl %eax +; SSE2-NEXT: notb %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: movd %xmm0, %ecx ; SSE2-NEXT: jne .LBB17_1 @@ -7019,7 +7019,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; SSE4-NEXT: pcmpeqw %xmm1, %xmm2 ; SSE4-NEXT: packsswb %xmm2, %xmm2 ; SSE4-NEXT: pmovmskb %xmm2, %eax -; SSE4-NEXT: notl %eax +; SSE4-NEXT: notb %al ; SSE4-NEXT: testb $1, %al ; SSE4-NEXT: jne .LBB17_1 ; SSE4-NEXT: # %bb.2: # %else @@ -7085,7 +7085,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpacksswb %xmm1, %xmm1, %xmm1 ; AVX-NEXT: vpmovmskb %xmm1, %eax -; AVX-NEXT: notl %eax +; AVX-NEXT: notb %al ; AVX-NEXT: testb $1, %al ; AVX-NEXT: jne .LBB17_1 ; AVX-NEXT: # %bb.2: # %else diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll index c997d314a50ae..eed2b2fa01ad3 100644 --- a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll +++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll @@ -359,8 +359,8 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) { ; CHECK-LABEL: test_store_sptr32_trunc_i1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: andl $1, %ecx +; CHECK-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; CHECK-NEXT: andb $1, %cl ; CHECK-NEXT: movb %cl, (%eax) ; CHECK-NEXT: retl ; @@ -368,8 +368,8 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) { ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movl {{[0-9]+}}(%esp), %ecx ; CHECK-O0-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-O0-NEXT: andl $1, %ecx ; CHECK-O0-NEXT: # kill: def $cl killed $cl killed $ecx +; CHECK-O0-NEXT: andb $1, %cl ; CHECK-O0-NEXT: movb %cl, (%eax) ; CHECK-O0-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll index 48d0ea49b70e6..70fbefe715ff9 100644 --- a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll +++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll @@ -285,15 +285,15 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) { ; CHECK-LABEL: test_store_sptr32_trunc_i1: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: movslq %ecx, %rax -; CHECK-NEXT: andl $1, %edx +; CHECK-NEXT: andb $1, %dl ; CHECK-NEXT: movb %dl, (%rax) ; CHECK-NEXT: retq ; ; CHECK-O0-LABEL: test_store_sptr32_trunc_i1: ; CHECK-O0: # %bb.0: # %entry ; CHECK-O0-NEXT: movslq %ecx, %rax -; CHECK-O0-NEXT: andl $1, %edx ; CHECK-O0-NEXT: movb %dl, %cl +; CHECK-O0-NEXT: andb $1, %cl ; CHECK-O0-NEXT: movb %cl, (%rax) ; CHECK-O0-NEXT: retq entry: diff --git 
a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll index 9b624a935bada..7b94072b21923 100644 --- a/llvm/test/CodeGen/X86/movmsk-cmp.ll +++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll @@ -3655,11 +3655,11 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) { ; SSE-NEXT: shrl $15, %ecx ; SSE-NEXT: movl %eax, %edx ; SSE-NEXT: shrl $8, %edx -; SSE-NEXT: andl $1, %edx +; SSE-NEXT: andb $1, %dl ; SSE-NEXT: andl $8, %eax ; SSE-NEXT: shrl $3, %eax -; SSE-NEXT: xorl %edx, %eax -; SSE-NEXT: andl %ecx, %eax +; SSE-NEXT: xorb %dl, %al +; SSE-NEXT: andb %cl, %al ; SSE-NEXT: # kill: def $al killed $al killed $eax ; SSE-NEXT: retq ; @@ -3671,11 +3671,11 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) { ; AVX1OR2-NEXT: shrl $15, %ecx ; AVX1OR2-NEXT: movl %eax, %edx ; AVX1OR2-NEXT: shrl $8, %edx -; AVX1OR2-NEXT: andl $1, %edx +; AVX1OR2-NEXT: andb $1, %dl ; AVX1OR2-NEXT: andl $8, %eax ; AVX1OR2-NEXT: shrl $3, %eax -; AVX1OR2-NEXT: xorl %edx, %eax -; AVX1OR2-NEXT: andl %ecx, %eax +; AVX1OR2-NEXT: xorb %dl, %al +; AVX1OR2-NEXT: andb %cl, %al ; AVX1OR2-NEXT: # kill: def $al killed $al killed $eax ; AVX1OR2-NEXT: retq ; @@ -3685,8 +3685,8 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) { ; KNL-NEXT: vpextrb $15, %xmm0, %ecx ; KNL-NEXT: vpextrb $8, %xmm0, %edx ; KNL-NEXT: vpextrb $3, %xmm0, %eax -; KNL-NEXT: xorl %edx, %eax -; KNL-NEXT: andl %ecx, %eax +; KNL-NEXT: xorb %dl, %al +; KNL-NEXT: andb %cl, %al ; KNL-NEXT: # kill: def $al killed $al killed $eax ; KNL-NEXT: retq ; @@ -4171,7 +4171,9 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) { ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2] ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: movmskpd %xmm1, %eax -; SSE2-NEXT: xorl $3, %eax +; SSE2-NEXT: xorb $3, %al +; Is this being performed by a different transformation that's using isNarrowingProfitable()? 
+; SSE2-NEXT: movzbl %al, %eax ; SSE2-NEXT: btl %edi, %eax ; SSE2-NEXT: setb %al ; SSE2-NEXT: retq @@ -4180,7 +4182,8 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) { ; SSE41: # %bb.0: ; SSE41-NEXT: pcmpeqq %xmm1, %xmm0 ; SSE41-NEXT: movmskpd %xmm0, %eax -; SSE41-NEXT: xorl $3, %eax +; SSE41-NEXT: xorb $3, %al +; SSE41-NEXT: movzbl %al, %eax ; SSE41-NEXT: btl %edi, %eax ; SSE41-NEXT: setb %al ; SSE41-NEXT: retq @@ -4189,7 +4192,8 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) { ; AVX1OR2: # %bb.0: ; AVX1OR2-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vmovmskpd %xmm0, %eax -; AVX1OR2-NEXT: xorl $3, %eax +; AVX1OR2-NEXT: xorb $3, %al +; AVX1OR2-NEXT: movzbl %al, %eax ; AVX1OR2-NEXT: btl %edi, %eax ; AVX1OR2-NEXT: setb %al ; AVX1OR2-NEXT: retq diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll index a4fa1ee8c0029..c972805e071c6 100644 --- a/llvm/test/CodeGen/X86/mul-constant-i8.ll +++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll @@ -72,7 +72,7 @@ define i8 @test_mul_by_7(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (,%rdi,8), %eax -; X64-NEXT: subl %edi, %eax +; X64-NEXT: subb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 7 @@ -118,7 +118,8 @@ define i8 @test_mul_by_11(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,2), %eax +; X64-NEXT: addl %eax, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 11 @@ -142,7 +143,8 @@ define i8 @test_mul_by_13(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: shll $2, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 13 @@ -155,7 +157,7 @@ define i8 @test_mul_by_14(i8 %x) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: leal (%rax,%rax), %ecx ; X64-NEXT: shll $4, %eax -; X64-NEXT: subl %ecx, %eax +; X64-NEXT: subb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %m = mul i8 %x, 14 @@ -188,6 +190,7 @@ define i8 @test_mul_by_16(i8 %x) { define i8 @test_mul_by_17(i8 %x) { ; X64-LABEL: test_mul_by_17: ; X64: # %bb.0: +; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $4, %eax ; X64-NEXT: addl %edi, %eax @@ -214,7 +217,8 @@ define i8 @test_mul_by_19(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rdi,%rax,2), %eax +; X64-NEXT: addl %eax, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 19 @@ -238,7 +242,8 @@ define i8 @test_mul_by_21(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: shll $2, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 21 @@ -249,9 +254,10 @@ define i8 @test_mul_by_22(i8 %x) { ; X64-LABEL: test_mul_by_22: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: leal (%rdi,%rdi,4), %ecx +; X64-NEXT: shll $2, 
%ecx +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: addb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 22 @@ -264,7 +270,7 @@ define i8 @test_mul_by_23(i8 %x) { ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,2), %eax ; X64-NEXT: shll $3, %eax -; X64-NEXT: subl %edi, %eax +; X64-NEXT: subb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 23 @@ -301,7 +307,7 @@ define i8 @test_mul_by_26(i8 %x) { ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,4), %eax ; X64-NEXT: leal (%rax,%rax,4), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 26 @@ -326,7 +332,7 @@ define i8 @test_mul_by_28(i8 %x) { ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,8), %eax ; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 28 @@ -338,9 +344,9 @@ define i8 @test_mul_by_29(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rax,%rax,2), %eax -; X64-NEXT: addl %edi, %eax -; X64-NEXT: addl %edi, %eax +; X64-NEXT: leal (%rax,%rax,2), %ecx +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: addb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 29 @@ -353,7 +359,7 @@ define i8 @test_mul_by_30(i8 %x) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: leal (%rax,%rax), %ecx ; X64-NEXT: shll $5, %eax -; X64-NEXT: subl %ecx, %eax +; X64-NEXT: subb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %m = mul i8 %x, 30 @@ -365,7 +371,7 @@ define i8 @test_mul_by_31(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: movl %edi, %eax ; X64-NEXT: shll $5, %eax -; X64-NEXT: subl %edi, %eax +; X64-NEXT: subb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 31 @@ -388,7 +394,8 @@ define i8 @test_mul_by_37(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax +; X64-NEXT: shll $2, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 37 @@ -400,7 +407,8 @@ define i8 @test_mul_by_41(i8 %x) { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: leal (%rdi,%rax,8), %eax +; X64-NEXT: shll $3, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 41 @@ -413,7 +421,7 @@ define i8 @test_mul_by_62(i8 %x) { ; X64-NEXT: movl %edi, %eax ; X64-NEXT: leal (%rax,%rax), %ecx ; X64-NEXT: shll $6, %eax -; X64-NEXT: subl %ecx, %eax +; X64-NEXT: subb %cl, %al ; X64-NEXT: # kill: def $al killed $al killed $rax ; X64-NEXT: retq %m = mul i8 %x, 62 @@ -424,9 +432,9 @@ define i8 @test_mul_by_66(i8 %x) { ; X64-LABEL: test_mul_by_66: ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi -; X64-NEXT: movl %edi, %eax -; X64-NEXT: shll $6, %eax -; X64-NEXT: leal (%rax,%rdi,2), %eax +; X64-NEXT: leal (%rdi,%rdi), %eax +; X64-NEXT: shll $6, %edi +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 66 @@ -438,7 +446,8 @@ define i8 @test_mul_by_73(i8 %x) { ; 
X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: leal (%rdi,%rax,8), %eax +; X64-NEXT: shll $3, %eax +; X64-NEXT: addb %dil, %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, 73 @@ -462,7 +471,7 @@ define i8 @test_mul_by_neg10(i8 %x) { ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: addl %edi, %edi ; X64-NEXT: leal (%rdi,%rdi,4), %eax -; X64-NEXT: negl %eax +; X64-NEXT: negb %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, -10 @@ -475,7 +484,7 @@ define i8 @test_mul_by_neg36(i8 %x) { ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: shll $2, %edi ; X64-NEXT: leal (%rdi,%rdi,8), %eax -; X64-NEXT: negl %eax +; X64-NEXT: negb %al ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq %m = mul i8 %x, -36 diff --git a/llvm/test/CodeGen/X86/or-with-overflow.ll b/llvm/test/CodeGen/X86/or-with-overflow.ll index b3ffa209bc700..c7b528a88ca0c 100644 --- a/llvm/test/CodeGen/X86/or-with-overflow.ll +++ b/llvm/test/CodeGen/X86/or-with-overflow.ll @@ -46,8 +46,8 @@ define i8 @or_i8_rr(i8 zeroext %0, i8 zeroext %1) { ; ; X64-LABEL: or_i8_rr: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %eax -; X64-NEXT: orl %edi, %eax +; X64-NEXT: orb %dil, %sil +; X64-NEXT: movzbl %sil, %eax ; X64-NEXT: cmovel %edi, %eax ; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/parity-vec.ll b/llvm/test/CodeGen/X86/parity-vec.ll index f9a2411465141..cb525a680b1a7 100644 --- a/llvm/test/CodeGen/X86/parity-vec.ll +++ b/llvm/test/CodeGen/X86/parity-vec.ll @@ -16,7 +16,7 @@ define i1 @noncanonical_parity(<16 x i1> %x) { ; POPCNT-NEXT: psllw $7, %xmm0 ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax -; POPCNT-NEXT: andl $1, %eax +; POPCNT-NEXT: andb $1, %al ; POPCNT-NEXT: # kill: def $al killed $al killed $eax ; POPCNT-NEXT: retq %r = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x) @@ -36,7 +36,7 @@ define i1 @canonical_parity(<16 x i1> %x) { ; POPCNT-NEXT: psllw $7, %xmm0 ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax -; POPCNT-NEXT: andl $1, %eax +; POPCNT-NEXT: andb $1, %al ; POPCNT-NEXT: # kill: def $al killed $al killed $eax ; POPCNT-NEXT: retq %i1 = bitcast <16 x i1> %x to i16 @@ -65,7 +65,7 @@ define i1 @canonical_parity_noncanonical_pred(<16 x i1> %x) { ; NOPOPCNT-NEXT: andl $3855, %ecx # imm = 0xF0F ; NOPOPCNT-NEXT: movl %ecx, %eax ; NOPOPCNT-NEXT: shrl $8, %eax -; NOPOPCNT-NEXT: addl %ecx, %eax +; NOPOPCNT-NEXT: addb %cl, %al ; NOPOPCNT-NEXT: # kill: def $al killed $al killed $eax ; NOPOPCNT-NEXT: retq ; @@ -97,8 +97,8 @@ define i1 @noncanonical_nonparity(<16 x i1> %x) { ; POPCNT-NEXT: psllw $7, %xmm0 ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax -; POPCNT-NEXT: andl $1, %eax -; POPCNT-NEXT: xorb $1, %al +; POPCNT-NEXT: notb %al +; POPCNT-NEXT: andb $1, %al ; POPCNT-NEXT: # kill: def $al killed $al killed $eax ; POPCNT-NEXT: retq %r.inv = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x) @@ -142,8 +142,8 @@ define i1 @canonical_nonparity_noncanonical_pred(<16 x i1> %x) { ; POPCNT-NEXT: psllw $7, %xmm0 ; POPCNT-NEXT: pmovmskb %xmm0, %eax ; POPCNT-NEXT: popcntl %eax, %eax -; POPCNT-NEXT: andl $1, %eax -; POPCNT-NEXT: xorb $1, %al +; POPCNT-NEXT: notb %al +; POPCNT-NEXT: andb $1, %al ; POPCNT-NEXT: # kill: def $al killed $al killed $eax ; POPCNT-NEXT: retq %i1 = bitcast <16 x i1> %x to i16 diff --git 
a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll index 420f5ba5ab433..2c3199db6ec65 100644 --- a/llvm/test/CodeGen/X86/parity.ll +++ b/llvm/test/CodeGen/X86/parity.ll @@ -318,14 +318,14 @@ define i8 @parity_32_trunc(i32 %x) { ; X86-POPCNT-LABEL: parity_32_trunc: ; X86-POPCNT: # %bb.0: ; X86-POPCNT-NEXT: popcntl {{[0-9]+}}(%esp), %eax -; X86-POPCNT-NEXT: andl $1, %eax +; X86-POPCNT-NEXT: andb $1, %al ; X86-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X86-POPCNT-NEXT: retl ; ; X64-POPCNT-LABEL: parity_32_trunc: ; X64-POPCNT: # %bb.0: ; X64-POPCNT-NEXT: popcntl %edi, %eax -; X64-POPCNT-NEXT: andl $1, %eax +; X64-POPCNT-NEXT: andb $1, %al ; X64-POPCNT-NEXT: # kill: def $al killed $al killed $eax ; X64-POPCNT-NEXT: retq %1 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -381,9 +381,8 @@ define i16 @parity_16_mask255(i16 %x) { define i16 @parity_16_mask15(i16 %x) { ; X86-LABEL: parity_16_mask15: ; X86: # %bb.0: -; X86-NEXT: movzwl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %eax, %eax -; X86-NEXT: testb $15, %cl +; X86-NEXT: testb $15, {{[0-9]+}}(%esp) ; X86-NEXT: setnp %al ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl diff --git a/llvm/test/CodeGen/X86/pr108731.ll b/llvm/test/CodeGen/X86/pr108731.ll index 2983d108eaedd..2f51daf0f19ef 100644 --- a/llvm/test/CodeGen/X86/pr108731.ll +++ b/llvm/test/CodeGen/X86/pr108731.ll @@ -95,12 +95,12 @@ Entry: define i8 @test_i8(i8 %w, i8 %x, i8 %y, i8 %z) { ; CHECK-LABEL: test_i8: ; CHECK: # %bb.0: # %Entry -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: andl %edx, %esi +; CHECK-NEXT: movl %ecx, %eax +; CHECK-NEXT: andb %dl, %sil ; CHECK-NEXT: notb %sil ; CHECK-NEXT: andb %dil, %sil -; CHECK-NEXT: notb %cl -; CHECK-NEXT: orb %cl, %al +; CHECK-NEXT: notb %al +; CHECK-NEXT: orb %dl, %al ; CHECK-NEXT: andb %sil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr14088.ll b/llvm/test/CodeGen/X86/pr14088.ll index 83bf13280f94a..32a5155c41b90 100644 --- a/llvm/test/CodeGen/X86/pr14088.ll +++ b/llvm/test/CodeGen/X86/pr14088.ll @@ -29,10 +29,10 @@ define i32 @f(i1 %foo, ptr %tm_year2, ptr %bar, i16 %zed, i32 %zed2) { ; CHECK-NEXT: movswq %ax, %rax ; CHECK-NEXT: imulq $1717986919, %rax, %rax # imm = 0x66666667 ; CHECK-NEXT: movq %rax, %rcx -; CHECK-NEXT: shrq $63, %rcx -; CHECK-NEXT: shrq $34, %rax -; CHECK-NEXT: addl %ecx, %eax -; CHECK-NEXT: movb %al, (%rdx) +; CHECK-NEXT: shrq $34, %rcx +; CHECK-NEXT: shrq $63, %rax +; CHECK-NEXT: addb %al, %cl +; CHECK-NEXT: movb %cl, (%rdx) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: .LBB0_2: # %return ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll index 5083eac71dce0..89ef4cb2201e4 100644 --- a/llvm/test/CodeGen/X86/pr15267.ll +++ b/llvm/test/CodeGen/X86/pr15267.ll @@ -84,62 +84,61 @@ define <16 x i4> @test4(ptr %in) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: movq (%rdi), %rax ; CHECK-NEXT: movl %eax, %ecx -; CHECK-NEXT: shrl $4, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: shrb $4, %cl ; CHECK-NEXT: movl %eax, %edx -; CHECK-NEXT: andl $15, %edx +; CHECK-NEXT: andb $15, %dl ; CHECK-NEXT: vmovd %edx, %xmm0 ; CHECK-NEXT: vpinsrb $1, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $8, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $2, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $12, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $3, %ecx, %xmm0, %xmm0 ; 
CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $16, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $4, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $20, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $5, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $24, %ecx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $6, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movl %eax, %ecx ; CHECK-NEXT: shrl $28, %ecx ; CHECK-NEXT: vpinsrb $7, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $32, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $8, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $36, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $9, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $40, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $10, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $44, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $11, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $48, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $12, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $52, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $13, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: movq %rax, %rcx ; CHECK-NEXT: shrq $56, %rcx -; CHECK-NEXT: andl $15, %ecx +; CHECK-NEXT: andb $15, %cl ; CHECK-NEXT: vpinsrb $14, %ecx, %xmm0, %xmm0 ; CHECK-NEXT: shrq $60, %rax ; CHECK-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll index d9671aa04f460..4a4082a2a59ed 100644 --- a/llvm/test/CodeGen/X86/pr32329.ll +++ b/llvm/test/CodeGen/X86/pr32329.ll @@ -29,33 +29,40 @@ define void @foo() local_unnamed_addr { ; X86-NEXT: .cfi_offset %edi, -16 ; X86-NEXT: .cfi_offset %ebx, -12 ; X86-NEXT: .cfi_offset %ebp, -8 -; X86-NEXT: movsbl var_27, %eax -; X86-NEXT: movzwl var_2, %ebx -; X86-NEXT: movl var_310, %ecx -; X86-NEXT: imull %eax, %ecx -; X86-NEXT: addl var_24, %ecx -; X86-NEXT: movl $4194303, %esi # imm = 0x3FFFFF -; X86-NEXT: andl obj, %esi +; X86-NEXT: movl obj, %esi ; X86-NEXT: leal (%esi,%esi), %edx -; X86-NEXT: subl %eax, %edx -; X86-NEXT: movl %edx, %edi -; X86-NEXT: subl %ebx, %edi -; X86-NEXT: imull %edi, %ecx +; X86-NEXT: movzwl var_2, %eax +; X86-NEXT: movb var_310, %ch +; X86-NEXT: movsbl var_27, %ebx +; X86-NEXT: subb %bl, %dl +; X86-NEXT: movb %dl, %cl +; X86-NEXT: subb %al, %cl +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movb %ch, %al +; X86-NEXT: mulb %bl +; X86-NEXT: addb var_24, %al +; X86-NEXT: mulb %cl +; X86-NEXT: movl %eax, %ecx ; X86-NEXT: addb $113, %cl -; X86-NEXT: movl $9, %ebx -; X86-NEXT: xorl %ebp, %ebp -; X86-NEXT: shldl %cl, %ebx, %ebp -; X86-NEXT: shll %cl, %ebx +; X86-NEXT: movl $9, %eax +; X86-NEXT: xorl %edi, %edi +; X86-NEXT: shldl %cl, %eax, %edi +; X86-NEXT: shll %cl, %eax ; X86-NEXT: testb $32, %cl -; X86-NEXT: cmovnel %ebx, %ebp +; X86-NEXT: cmovnel %eax, %edi ; X86-NEXT: movl $0, %ecx -; X86-NEXT: cmovnel %ecx, %ebx -; X86-NEXT: cmpl %esi, %edi -; X86-NEXT: movl %ebp, var_50+4 -; X86-NEXT: movl %ebx, var_50 +; X86-NEXT: cmovnel %ecx, %eax +; X86-NEXT: andl $4194303, %esi # imm = 0x3FFFFF +; X86-NEXT: leal (%esi,%esi), %ecx +; 
X86-NEXT: subl %ebx, %ecx +; X86-NEXT: subl %ebp, %ecx +; X86-NEXT: cmpl %esi, %ecx +; X86-NEXT: movl %edi, var_50+4 +; X86-NEXT: movl %eax, var_50 ; X86-NEXT: setge var_205 -; X86-NEXT: imull %eax, %edx -; X86-NEXT: movb %dl, var_218 +; X86-NEXT: movl %edx, %eax +; X86-NEXT: mulb %bl +; X86-NEXT: movb %al, var_218 ; X86-NEXT: popl %esi ; X86-NEXT: .cfi_def_cfa_offset 16 ; X86-NEXT: popl %edi @@ -68,27 +75,33 @@ define void @foo() local_unnamed_addr { ; ; X64-LABEL: foo: ; X64: # %bb.0: # %entry -; X64-NEXT: movsbl var_27(%rip), %eax -; X64-NEXT: movzwl var_2(%rip), %edx -; X64-NEXT: movl var_310(%rip), %ecx -; X64-NEXT: imull %eax, %ecx -; X64-NEXT: addl var_24(%rip), %ecx -; X64-NEXT: movl $4194303, %esi # imm = 0x3FFFFF -; X64-NEXT: andl obj(%rip), %esi -; X64-NEXT: leal (%rsi,%rsi), %edi -; X64-NEXT: subl %eax, %edi -; X64-NEXT: movl %edi, %r8d -; X64-NEXT: subl %edx, %r8d -; X64-NEXT: imull %r8d, %ecx -; X64-NEXT: addb $113, %cl -; X64-NEXT: movl $9, %edx +; X64-NEXT: movl obj(%rip), %esi +; X64-NEXT: leal (%rsi,%rsi), %edx +; X64-NEXT: movzwl var_2(%rip), %ecx +; X64-NEXT: movzwl %cx, %r8d +; X64-NEXT: movzbl var_310(%rip), %eax +; X64-NEXT: movsbl var_27(%rip), %edi +; X64-NEXT: subb %dil, %dl +; X64-NEXT: movl %edx, %r9d +; X64-NEXT: subb %cl, %r9b +; X64-NEXT: mulb %dil +; X64-NEXT: addb var_24(%rip), %al +; X64-NEXT: mulb %r9b +; X64-NEXT: # kill: def $al killed $al def $rax +; X64-NEXT: leal 113(%rax), %ecx +; X64-NEXT: movl $9, %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: shlq %cl, %rdx -; X64-NEXT: movq %rdx, var_50(%rip) -; X64-NEXT: cmpl %esi, %r8d +; X64-NEXT: shlq %cl, %rax +; X64-NEXT: movq %rax, var_50(%rip) +; X64-NEXT: andl $4194303, %esi # imm = 0x3FFFFF +; X64-NEXT: leal (%rsi,%rsi), %eax +; X64-NEXT: subl %edi, %eax +; X64-NEXT: subl %r8d, %eax +; X64-NEXT: cmpl %esi, %eax ; X64-NEXT: setge var_205(%rip) -; X64-NEXT: imull %eax, %edi -; X64-NEXT: movb %dil, var_218(%rip) +; X64-NEXT: movl %edx, %eax +; X64-NEXT: mulb %dil +; X64-NEXT: movb %al, var_218(%rip) ; X64-NEXT: retq entry: %bf.load = load i32, ptr @obj, align 8 diff --git a/llvm/test/CodeGen/X86/pr35761.ll b/llvm/test/CodeGen/X86/pr35761.ll index 5661b6775ab9d..804366d93203b 100644 --- a/llvm/test/CodeGen/X86/pr35761.ll +++ b/llvm/test/CodeGen/X86/pr35761.ll @@ -8,10 +8,10 @@ define dso_local void @PR35761(i32 %call) { ; CHECK-LABEL: PR35761: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movzbl x(%rip), %eax -; CHECK-NEXT: andl $1, %eax -; CHECK-NEXT: movzbl y(%rip), %ecx -; CHECK-NEXT: xorl $255, %ecx +; CHECK-NEXT: movzbl y(%rip), %eax +; CHECK-NEXT: xorl $255, %eax +; CHECK-NEXT: movzbl x(%rip), %ecx +; CHECK-NEXT: andl $1, %ecx ; CHECK-NEXT: orl %eax, %ecx ; CHECK-NEXT: movw %cx, z(%rip) ; CHECK-NEXT: movb $0, z+2(%rip) diff --git a/llvm/test/CodeGen/X86/pr40539.ll b/llvm/test/CodeGen/X86/pr40539.ll index 56d80a025fa08..a1df2d8f69238 100644 --- a/llvm/test/CodeGen/X86/pr40539.ll +++ b/llvm/test/CodeGen/X86/pr40539.ll @@ -18,7 +18,7 @@ define zeroext i1 @_Z9test_log2v() { ; CHECK-NEXT: xorps %xmm0, %xmm0 ; CHECK-NEXT: cmpeqss (%esp), %xmm0 ; CHECK-NEXT: movd %xmm0, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: popl %ecx ; CHECK-NEXT: .cfi_def_cfa_offset 4 diff --git a/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll b/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll index a0bd35d5d219b..c330d3b5bd436 100644 --- a/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll +++ 
b/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll @@ -15,7 +15,7 @@ define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr { ; ; X86-LABEL: f32_bzhi: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl entry: @@ -34,7 +34,7 @@ define i32 @f32_bzhi_commute(i32 %x, i32 %y) local_unnamed_addr { ; ; X86-LABEL: f32_bzhi_commute: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl entry: @@ -53,7 +53,7 @@ define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr { ; ; X86-LABEL: f32_bzhi_partial: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl entry: @@ -72,7 +72,7 @@ define i32 @f32_bzhi_partial_commute(i32 %x, i32 %y) local_unnamed_addr { ; ; X86-LABEL: f32_bzhi_partial_commute: ; X86: # %bb.0: # %entry -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: bzhil %eax, {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll index 874913629e9e3..aba13cf442608 100644 --- a/llvm/test/CodeGen/X86/scmp.ll +++ b/llvm/test/CodeGen/X86/scmp.ll @@ -2393,7 +2393,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind { ; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-NEXT: movl %eax, 12(%edi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: andl $7, %eax +; X86-NEXT: andb $7, %al ; X86-NEXT: movb %al, 102(%edi) ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NEXT: shll $30, %eax diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll index e7727a0ab6178..f8a0698208ebd 100644 --- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll +++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll @@ -597,7 +597,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: subq $1, %rbp ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: xorb %r15b, %bl ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload @@ -648,7 +648,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: subq $1, %rbp ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: xorb %r15b, %bl ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload @@ -710,7 +710,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: subq $1, %rbp ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: xorb %r15b, %bl ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload @@ -760,7 +760,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind { ; X64-NEXT: subq $1, %rbp ; X64-NEXT: sbbq $0, %r14 ; X64-NEXT: shrq $63, %rbx -; X64-NEXT: xorl %r15d, %ebx +; X64-NEXT: xorb %r15b, %bl ; X64-NEXT: movq %r12, %rdi ; X64-NEXT: movq %r13, %rsi ; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload diff --git a/llvm/test/CodeGen/X86/setcc-combine.ll b/llvm/test/CodeGen/X86/setcc-combine.ll index 
f526db00df606..9ae5f34a38e76 100644 --- a/llvm/test/CodeGen/X86/setcc-combine.ll +++ b/llvm/test/CodeGen/X86/setcc-combine.ll @@ -262,7 +262,7 @@ define void @test_i1_uge(ptr%A2) { define i64 @PR40657(i8 %var2, i8 %var9) { ; CHECK-LABEL: PR40657: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %esi, %edi +; CHECK-NEXT: xorb %sil, %dil ; CHECK-NEXT: notb %dil ; CHECK-NEXT: movzbl %dil, %eax ; CHECK-NEXT: andl $1, %eax diff --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll index c98aae7fbf405..3d9105922e316 100644 --- a/llvm/test/CodeGen/X86/setcc-logic.ll +++ b/llvm/test/CodeGen/X86/setcc-logic.ll @@ -454,9 +454,9 @@ define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) nounwind { define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind { ; CHECK-LABEL: and_eq: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %esi, %edi -; CHECK-NEXT: xorl %ecx, %edx -; CHECK-NEXT: orb %dl, %dil +; CHECK-NEXT: xorb %sil, %dil +; CHECK-NEXT: xorb %cl, %dl +; CHECK-NEXT: orb %dil, %dl ; CHECK-NEXT: sete %al ; CHECK-NEXT: retq %cmp1 = icmp eq i8 %a, %b @@ -468,9 +468,9 @@ define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind { define zeroext i1 @or_ne(i8 %a, i8 %b, i8 %c, i8 %d) nounwind { ; CHECK-LABEL: or_ne: ; CHECK: # %bb.0: -; CHECK-NEXT: xorl %esi, %edi -; CHECK-NEXT: xorl %ecx, %edx -; CHECK-NEXT: orb %dl, %dil +; CHECK-NEXT: xorb %sil, %dil +; CHECK-NEXT: xorb %cl, %dl +; CHECK-NEXT: orb %dil, %dl ; CHECK-NEXT: setne %al ; CHECK-NEXT: retq %cmp1 = icmp ne i8 %a, %b diff --git a/llvm/test/CodeGen/X86/setoeq.ll b/llvm/test/CodeGen/X86/setoeq.ll index f0addf4b64599..d30e9395af045 100644 --- a/llvm/test/CodeGen/X86/setoeq.ll +++ b/llvm/test/CodeGen/X86/setoeq.ll @@ -9,7 +9,7 @@ define zeroext i8 @t(double %x) nounwind readnone { ; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 ; CHECK-NEXT: cmpeqsd %xmm0, %xmm1 ; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl entry: @@ -28,7 +28,7 @@ define zeroext i8 @u(double %x) nounwind readnone { ; CHECK-NEXT: cvtdq2pd %xmm1, %xmm1 ; CHECK-NEXT: cmpneqsd %xmm0, %xmm1 ; CHECK-NEXT: movd %xmm1, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl entry: diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll index 9f7ac748c47e1..c212e028b6c90 100644 --- a/llvm/test/CodeGen/X86/shift-amount-mod.ll +++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll @@ -735,10 +735,9 @@ define i32 @reg32_lshr_by_sub_from_negated(i32 %val, i32 %a, i32 %b) nounwind { ; X86-LABEL: reg32_lshr_by_sub_from_negated: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: negb %cl -; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl ; @@ -763,8 +762,8 @@ define i64 @reg64_lshr_by_sub_from_negated(i64 %val, i64 %a, i64 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addb {{[0-9]+}}(%esp), %dl ; X86-NEXT: movb $64, %cl ; X86-NEXT: subb %dl, %cl ; X86-NEXT: movl %esi, %edx @@ -782,7 +781,7 @@ define i64 @reg64_lshr_by_sub_from_negated(i64 %val, 
i64 %a, i64 %b) nounwind { ; X64-LABEL: reg64_lshr_by_sub_from_negated: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -800,9 +799,8 @@ define i32 @reg32_lshr_by_sub_of_negated(i32 %val, i32 %a, i32 %b) nounwind { ; X86-LABEL: reg32_lshr_by_sub_of_negated: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl ; @@ -826,8 +824,8 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: addb $-64, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx @@ -844,7 +842,7 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind { ; X64-LABEL: reg64_lshr_by_sub_of_negated: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -862,9 +860,8 @@ define i32 @reg32_lshr_by_add_to_negated(i32 %val, i32 %a, i32 %b) nounwind { ; X86-LABEL: reg32_lshr_by_add_to_negated: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl ; @@ -872,7 +869,7 @@ define i32 @reg32_lshr_by_add_to_negated(i32 %val, i32 %a, i32 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movl %edx, %ecx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: subl %esi, %ecx +; X64-NEXT: subb %sil, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %eax ; X64-NEXT: retq @@ -887,8 +884,8 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NEXT: addb $64, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx @@ -906,7 +903,7 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind { ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: subl %esi, %ecx +; X64-NEXT: subb %sil, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -923,9 +920,8 @@ define i32 @reg32_lshr_by_sub_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind ; X86-LABEL: reg32_lshr_by_sub_of_negated_amts: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shrl %cl, %eax ; 
X86-NEXT: retl ; @@ -933,7 +929,7 @@ define i32 @reg32_lshr_by_sub_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind ; X64: # %bb.0: ; X64-NEXT: movl %edx, %ecx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: subl %esi, %ecx +; X64-NEXT: subb %sil, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %eax ; X64-NEXT: retq @@ -949,8 +945,8 @@ define i64 @reg64_lshr_by_sub_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx ; X86-NEXT: shrdl %cl, %esi, %eax @@ -967,7 +963,7 @@ define i64 @reg64_lshr_by_sub_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: subl %esi, %ecx +; X64-NEXT: subb %sil, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -985,10 +981,9 @@ define i32 @reg32_lshr_by_add_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind ; X86-LABEL: reg32_lshr_by_add_of_negated_amts: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: negb %cl -; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl ; @@ -1014,8 +1009,8 @@ define i64 @reg64_lshr_by_add_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addb {{[0-9]+}}(%esp), %dl ; X86-NEXT: movb $-128, %cl ; X86-NEXT: subb %dl, %cl ; X86-NEXT: movl %esi, %edx @@ -1033,7 +1028,7 @@ define i64 @reg64_lshr_by_add_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind ; X64-LABEL: reg64_lshr_by_add_of_negated_amts: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -1109,10 +1104,9 @@ define i32 @reg32_lshr_by_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b) nounw ; X86-LABEL: reg32_lshr_by_negated_unfolded_sub_b: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: negb %cl -; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl ; @@ -1138,8 +1132,8 @@ define i64 @reg64_lshr_by_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b) nounw ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: addl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: addb {{[0-9]+}}(%esp), %dl ; X86-NEXT: movb $64, %cl ; X86-NEXT: subb %dl, %cl ; X86-NEXT: movl %esi, %edx @@ -1157,7 +1151,7 @@ define i64 @reg64_lshr_by_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b) nounw ; X64-LABEL: reg64_lshr_by_negated_unfolded_sub_b: ; X64: # %bb.0: ; 
X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: negb %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax @@ -1173,9 +1167,8 @@ define i32 @reg32_lshr_by_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b) nounw ; X86-LABEL: reg32_lshr_by_b_sub_negated_unfolded: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl ; @@ -1200,8 +1193,8 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: addb $-64, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx @@ -1218,7 +1211,7 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw ; X64-LABEL: reg64_lshr_by_b_sub_negated_unfolded: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -1233,9 +1226,8 @@ define i32 @reg32_lshr_by_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b) nounw ; X86-LABEL: reg32_lshr_by_negated_unfolded_add_b: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl ; @@ -1243,7 +1235,7 @@ define i32 @reg32_lshr_by_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b) nounw ; X64: # %bb.0: ; X64-NEXT: movl %edx, %ecx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: subl %esi, %ecx +; X64-NEXT: subb %sil, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %eax ; X64-NEXT: retq @@ -1259,8 +1251,8 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NEXT: addb $64, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx @@ -1278,7 +1270,7 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: subl %esi, %ecx +; X64-NEXT: subb %sil, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -1355,9 +1347,9 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $31, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NEXT: andb $31, %cl +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shrl %cl, %eax ; 
X86-NEXT: retl @@ -1366,9 +1358,9 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b ; X64: # %bb.0: ; X64-NEXT: movl %esi, %ecx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: negl %ecx -; X64-NEXT: andl $31, %ecx -; X64-NEXT: subl %edx, %ecx +; X64-NEXT: negb %cl +; X64-NEXT: andb $31, %cl +; X64-NEXT: subb %dl, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %eax ; X64-NEXT: retq @@ -1385,9 +1377,10 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $63, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subb %dl, %cl +; X86-NEXT: andb $63, %cl +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx ; X86-NEXT: shrdl %cl, %esi, %eax @@ -1404,9 +1397,9 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b ; X64: # %bb.0: ; X64-NEXT: movq %rsi, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: negl %ecx -; X64-NEXT: andl $63, %ecx -; X64-NEXT: subl %edx, %ecx +; X64-NEXT: negb %cl +; X64-NEXT: andb $63, %cl +; X64-NEXT: subb %dl, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -1421,12 +1414,11 @@ define i32 @reg32_lshr_by_masked_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b ; X86-LABEL: reg32_lshr_by_masked_b_sub_negated_unfolded: ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl $31, %edx -; X86-NEXT: subl %edx, %ecx -; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %dl +; X86-NEXT: andb $31, %dl +; X86-NEXT: subb %dl, %cl ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl ; @@ -1434,9 +1426,9 @@ define i32 @reg32_lshr_by_masked_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b ; X64: # %bb.0: ; X64-NEXT: movl %edx, %ecx ; X64-NEXT: movl %edi, %eax -; X64-NEXT: negl %esi -; X64-NEXT: andl $31, %esi -; X64-NEXT: subl %esi, %ecx +; X64-NEXT: negb %sil +; X64-NEXT: andb $31, %sil +; X64-NEXT: subb %sil, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %eax ; X64-NEXT: retq @@ -1452,11 +1444,12 @@ define i64 @reg64_lshr_by_masked_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: xorl %edx, %edx -; X86-NEXT: subl {{[0-9]+}}(%esp), %edx -; X86-NEXT: andl $63, %edx -; X86-NEXT: subl %edx, %ecx +; X86-NEXT: movb {{[0-9]+}}(%esp), %ch +; X86-NEXT: subb %ch, %dl +; X86-NEXT: andb $63, %dl +; X86-NEXT: subb %dl, %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx ; X86-NEXT: shrdl %cl, %esi, %eax @@ -1473,9 +1466,9 @@ define i64 @reg64_lshr_by_masked_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b ; X64: # %bb.0: ; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: negl %esi -; X64-NEXT: andl $63, %esi -; X64-NEXT: subl %esi, %ecx +; X64-NEXT: negb %sil +; X64-NEXT: andb $63, %sil +; X64-NEXT: subb %sil, %cl ; X64-NEXT: # kill: def $cl killed $cl killed $rcx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq @@ -1491,9 +1484,9 @@ define i32 
@reg32_lshr_by_masked_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $31, %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: subb {{[0-9]+}}(%esp), %cl +; X86-NEXT: andb $31, %cl +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NEXT: shrl %cl, %eax ; X86-NEXT: retl @@ -1503,8 +1496,8 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b ; X64-NEXT: # kill: def $edx killed $edx def $rdx ; X64-NEXT: # kill: def $esi killed $esi def $rsi ; X64-NEXT: movl %edi, %eax -; X64-NEXT: negl %esi -; X64-NEXT: andl $31, %esi +; X64-NEXT: negb %sil +; X64-NEXT: andb $31, %sil ; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrl %cl, %eax @@ -1522,9 +1515,10 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: xorl %ecx, %ecx -; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: andl $63, %ecx -; X86-NEXT: addl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: subb %dl, %cl +; X86-NEXT: andb $63, %cl +; X86-NEXT: addb {{[0-9]+}}(%esp), %cl ; X86-NEXT: movl %esi, %edx ; X86-NEXT: shrl %cl, %edx ; X86-NEXT: shrdl %cl, %esi, %eax @@ -1540,9 +1534,9 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b ; X64-LABEL: reg64_lshr_by_masked_negated_unfolded_add_b: ; X64: # %bb.0: ; X64-NEXT: movq %rdi, %rax -; X64-NEXT: negl %esi -; X64-NEXT: andl $63, %esi -; X64-NEXT: leal (%rdx,%rsi), %ecx +; X64-NEXT: negb %sil +; X64-NEXT: andb $63, %sil +; X64-NEXT: leal (%rsi,%rdx), %ecx ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: shrq %cl, %rax ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 767bd772ab7a3..bc2522519c164 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -255,86 +255,84 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: pushl %esi ; i686-NEXT: andl $-16, %esp ; i686-NEXT: subl $112, %esp -; i686-NEXT: movl 40(%ebp), %edx +; i686-NEXT: movl 40(%ebp), %ecx ; i686-NEXT: movl 24(%ebp), %eax -; i686-NEXT: movl 28(%ebp), %ecx -; i686-NEXT: movl 32(%ebp), %esi -; i686-NEXT: movl 20(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl 16(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl 12(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl 8(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl 36(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 28(%ebp), %edi +; i686-NEXT: movl 32(%ebp), %edx +; i686-NEXT: movl 20(%ebp), %esi ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; i686-NEXT: movl 16(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 12(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 8(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 36(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, 
{{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edx, %ebx -; i686-NEXT: andl $31, %ebx -; i686-NEXT: shrl $3, %edx -; i686-NEXT: andl $12, %edx -; i686-NEXT: movl 40(%esp,%edx), %eax -; i686-NEXT: movl 36(%esp,%edx), %esi +; i686-NEXT: movl %ecx, %eax +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $12, %eax +; i686-NEXT: movl 40(%esp,%eax), %edx +; i686-NEXT: movl 36(%esp,%eax), %esi ; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrdl %cl, %eax, %esi +; i686-NEXT: shrdl %cl, %edx, %esi ; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 32(%esp,%edx), %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 44(%esp,%edx), %edx -; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: movl %ebx, %esi -; i686-NEXT: shrdl %cl, %edx, %eax +; i686-NEXT: movl 32(%esp,%eax), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 44(%esp,%eax), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shrdl %cl, %eax, %edx +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl 56(%ebp), %edx -; i686-NEXT: movl %edx, %eax -; i686-NEXT: andl $31, %eax -; i686-NEXT: shrl $3, %edx -; i686-NEXT: andl $12, %edx -; i686-NEXT: movl 72(%esp,%edx), %ebx -; i686-NEXT: movl 68(%esp,%edx), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %ebx, %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 64(%esp,%edx), %edi -; i686-NEXT: movl 76(%esp,%edx), %edx -; i686-NEXT: shrdl %cl, %edx, %ebx -; i686-NEXT: movl %esi, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: movl 56(%ebp), %ecx +; i686-NEXT: movl %ecx, %esi +; i686-NEXT: shrl $3, %esi +; i686-NEXT: andl $12, %esi +; i686-NEXT: movl 72(%esp,%esi), %edi +; i686-NEXT: movl 68(%esp,%esi), %ebx +; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrdl %cl, %edi, %ebx +; i686-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 64(%esp,%esi), %edx +; i686-NEXT: movl 76(%esp,%esi), %esi +; i686-NEXT: movl %ecx, %ebx +; i686-NEXT: shrdl %cl, %esi, %edi +; i686-NEXT: movl 40(%ebp), %ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: shrl %cl, %eax +; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; i686-NEXT: shrdl %cl, %ebx, %edx +; i686-NEXT: movl 56(%ebp), %ecx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: movl %eax, %ecx +; i686-NEXT: shrl %cl, %esi +; i686-NEXT: movl 72(%ebp), %ecx +; i686-NEXT: movl %esi, 28(%ecx) +; i686-NEXT: movl %edi, 24(%ecx) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 
4-byte Reload -; i686-NEXT: shrdl %cl, %esi, %edi -; i686-NEXT: shrl %cl, %edx -; i686-NEXT: movl 72(%ebp), %eax -; i686-NEXT: movl %edx, 28(%eax) -; i686-NEXT: movl %ebx, 24(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 20(%eax) -; i686-NEXT: movl %edi, 16(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 12(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 8(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 4(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: movl %esi, 20(%ecx) +; i686-NEXT: movl %edx, 16(%ecx) +; i686-NEXT: movl %eax, 12(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, 8(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, 4(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, (%ecx) ; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi @@ -382,89 +380,86 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no ; i686-NEXT: pushl %esi ; i686-NEXT: andl $-16, %esp ; i686-NEXT: subl $112, %esp -; i686-NEXT: movl 40(%ebp), %edx +; i686-NEXT: movl 40(%ebp), %ecx ; i686-NEXT: movl 24(%ebp), %eax -; i686-NEXT: movl 28(%ebp), %ecx -; i686-NEXT: movl 32(%ebp), %esi -; i686-NEXT: movl 16(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl 12(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl 8(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl 20(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: sarl $31, %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl 36(%ebp), %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 28(%ebp), %edi +; i686-NEXT: movl 32(%ebp), %edx +; i686-NEXT: movl 16(%ebp), %esi ; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: sarl $31, %edi -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 12(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 8(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 20(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: sarl $31, %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl 36(%ebp), %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %edi, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edx, %eax -; i686-NEXT: andl $31, %eax -; i686-NEXT: shrl $3, %edx -; i686-NEXT: andl $12, %edx -; i686-NEXT: movl 40(%esp,%edx), %esi -; i686-NEXT: movl 36(%esp,%edx), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shrdl %cl, %esi, %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 32(%esp,%edx), %ecx -; i686-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 44(%esp,%edx), %edx -; i686-NEXT: movl %edx, (%esp) # 4-byte Spill -; i686-NEXT: movl %eax, %ecx +; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) +; i686-NEXT: sarl $31, %esi +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %esi, {{[0-9]+}}(%esp) +; i686-NEXT: movl %ecx, %eax +; i686-NEXT: shrl $3, %eax +; i686-NEXT: andl $12, %eax +; i686-NEXT: movl 40(%esp,%eax), %edx +; i686-NEXT: movl 36(%esp,%eax), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; i686-NEXT: shrdl %cl, %edx, %esi ; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 56(%ebp), %edx -; i686-NEXT: movl %edx, %ebx -; i686-NEXT: andl $31, %ebx -; i686-NEXT: shrl $3, %edx -; i686-NEXT: andl $12, %edx -; i686-NEXT: movl 72(%esp,%edx), %esi -; i686-NEXT: movl 68(%esp,%edx), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ebx, %ecx +; i686-NEXT: movl 32(%esp,%eax), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 44(%esp,%eax), %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: shrdl %cl, %eax, %edx +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 56(%ebp), %ecx +; i686-NEXT: movl %ecx, %esi +; i686-NEXT: shrl $3, %esi +; i686-NEXT: andl $12, %esi +; i686-NEXT: movl 72(%esp,%esi), %edi +; i686-NEXT: movl 68(%esp,%esi), %edx +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: shrdl %cl, %edi, %edx +; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 64(%esp,%esi), %ebx +; i686-NEXT: movl 76(%esp,%esi), %esi +; i686-NEXT: movl %ecx, %edx ; i686-NEXT: shrdl %cl, %esi, %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 64(%esp,%edx), %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 76(%esp,%edx), %edx -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: shrdl %cl, %edx, %esi -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; i686-NEXT: shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill -; i686-NEXT: sarl %cl, (%esp) # 4-byte Folded Spill -; i686-NEXT: movl %ebx, %ecx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; i686-NEXT: movl 40(%ebp), %ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: shrdl %cl, %eax, %edi -; i686-NEXT: sarl %cl, %edx -; i686-NEXT: movl 72(%ebp), %eax -; i686-NEXT: movl %edx, 28(%eax) -; i686-NEXT: movl %esi, 24(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 20(%eax) -; i686-NEXT: movl %edi, 16(%eax) -; i686-NEXT: movl (%esp), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 12(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 8(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 4(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: sarl %cl, %eax +; i686-NEXT: movl %edx, %ecx +; 
i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; i686-NEXT: shrdl %cl, %edx, %ebx +; i686-NEXT: movl 56(%ebp), %ecx +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: sarl %cl, %esi +; i686-NEXT: movl 72(%ebp), %ecx +; i686-NEXT: movl %esi, 28(%ecx) +; i686-NEXT: movl %edi, 24(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; i686-NEXT: movl %edx, 20(%ecx) +; i686-NEXT: movl %ebx, 16(%ecx) +; i686-NEXT: movl %eax, 12(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, 8(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, 4(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, (%ecx) ; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi @@ -514,8 +509,8 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: pushl %edi ; i686-NEXT: pushl %esi ; i686-NEXT: andl $-16, %esp -; i686-NEXT: subl $128, %esp -; i686-NEXT: movl 40(%ebp), %edi +; i686-NEXT: subl $112, %esp +; i686-NEXT: movl 40(%ebp), %ebx ; i686-NEXT: movl 24(%ebp), %eax ; i686-NEXT: movl 28(%ebp), %ecx ; i686-NEXT: movl 32(%ebp), %edx @@ -532,7 +527,6 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl %edx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; i686-NEXT: movl %eax, {{[0-9]+}}(%esp) -; i686-NEXT: movl %edi, %ebx ; i686-NEXT: shrl $3, %ebx ; i686-NEXT: andl $12, %ebx ; i686-NEXT: leal {{[0-9]+}}(%esp), %eax @@ -541,19 +535,16 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl (%eax), %esi -; i686-NEXT: movl 4(%eax), %edx -; i686-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl (%eax), %ecx +; i686-NEXT: movl %ecx, (%esp) # 4-byte Spill +; i686-NEXT: movl 4(%eax), %edi ; i686-NEXT: movl 8(%eax), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %edi, %ecx -; i686-NEXT: andl $31, %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 40(%ebp), %ecx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shldl %cl, %edx, %eax +; i686-NEXT: shldl %cl, %edi, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 56(%ebp), %eax -; i686-NEXT: movl %eax, %edx +; i686-NEXT: movl 56(%ebp), %edx ; i686-NEXT: shrl $3, %edx ; i686-NEXT: andl $12, %edx ; i686-NEXT: leal {{[0-9]+}}(%esp), %ecx @@ -562,52 +553,53 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) ; i686-NEXT: movl $0, {{[0-9]+}}(%esp) -; i686-NEXT: movl (%ecx), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 4(%ecx), %edi -; i686-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl 8(%ecx), %ecx -; i686-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: andl $31, %eax +; i686-NEXT: movl (%ecx), %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: movl %ecx, %eax -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl 4(%ecx), %esi +; i686-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
i686-NEXT: movl 8(%ecx), %eax +; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl 56(%ebp), %ecx ; i686-NEXT: # kill: def $cl killed $cl killed $ecx -; i686-NEXT: shldl %cl, %edi, %eax +; i686-NEXT: shldl %cl, %esi, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; i686-NEXT: movl (%esp), %esi # 4-byte Reload ; i686-NEXT: movl %esi, %eax -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; i686-NEXT: movl 40(%ebp), %ecx ; i686-NEXT: shll %cl, %eax ; i686-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; i686-NEXT: shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: shldl %cl, %esi, %edi +; i686-NEXT: movl %edi, (%esp) # 4-byte Spill ; i686-NEXT: negl %ebx -; i686-NEXT: movl 76(%esp,%ebx), %ebx +; i686-NEXT: movl 60(%esp,%ebx), %ebx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: shldl %cl, %eax, %ebx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; i686-NEXT: shldl %cl, %esi, %ebx -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; i686-NEXT: movl %edi, %esi +; i686-NEXT: movl %esi, %edi +; i686-NEXT: movl 56(%ebp), %ecx +; i686-NEXT: shll %cl, %edi +; i686-NEXT: # kill: def $cl killed $cl killed $ecx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: movl %eax, %ecx -; i686-NEXT: shll %cl, %esi -; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill +; i686-NEXT: shldl %cl, %esi, %eax ; i686-NEXT: negl %edx -; i686-NEXT: movl 108(%esp,%edx), %edx +; i686-NEXT: movl 92(%esp,%edx), %edx +; i686-NEXT: movl 56(%ebp), %ecx +; i686-NEXT: # kill: def $cl killed $cl killed $ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; i686-NEXT: shldl %cl, %esi, %edx +; i686-NEXT: movl 72(%ebp), %ecx +; i686-NEXT: movl %edx, 28(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; i686-NEXT: movl %edx, 24(%ecx) +; i686-NEXT: movl %eax, 20(%ecx) +; i686-NEXT: movl %edi, 16(%ecx) +; i686-NEXT: movl %ebx, 12(%ecx) ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: shldl %cl, %eax, %edx -; i686-NEXT: movl 72(%ebp), %eax -; i686-NEXT: movl %edx, 28(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 24(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 20(%eax) -; i686-NEXT: movl %esi, 16(%eax) -; i686-NEXT: movl %ebx, 12(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 8(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, 4(%eax) -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; i686-NEXT: movl %ecx, (%eax) +; i686-NEXT: movl %eax, 8(%ecx) +; i686-NEXT: movl (%esp), %eax # 4-byte Reload +; i686-NEXT: movl %eax, 4(%ecx) +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; i686-NEXT: movl %eax, (%ecx) ; i686-NEXT: leal -12(%ebp), %esp ; i686-NEXT: popl %esi ; i686-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll index cc4bda81bef52..392beb72c7bfd 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll @@ -380,7 +380,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind { ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm2 ; 
AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: notl %eax +; AVX2-NEXT: notb %al ; AVX2-NEXT: vpextrb $8, %xmm1, %edx ; AVX2-NEXT: vpextrb $0, %xmm2, %ecx ; AVX2-NEXT: # kill: def $al killed $al killed $eax diff --git a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll index f7e053d384c99..f624122be8374 100644 --- a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll +++ b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll @@ -27,7 +27,7 @@ define i1 @test_spill_slot_size(i1 %a1, i2 %a2, i7 %a7, i8 %a8, i9 %a9, i15 %a15 ; CHECK-NEXT: andb $3, %sil ; CHECK-NEXT: movb %sil, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl %ebx, %eax -; CHECK-NEXT: andl $1, %eax +; CHECK-NEXT: andb $1, %al ; CHECK-NEXT: movb %al, {{[0-9]+}}(%rsp) ; CHECK-NEXT: andb $127, %dl ; CHECK-NEXT: movb %dl, {{[0-9]+}}(%rsp) diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll index 7d5db07c0172a..c05e8cb825797 100644 --- a/llvm/test/CodeGen/X86/subcarry.ll +++ b/llvm/test/CodeGen/X86/subcarry.ll @@ -46,13 +46,16 @@ define %S @negate(ptr nocapture readonly %this) { ; CHECK-NEXT: subq (%rsi), %rdx ; CHECK-NEXT: movl $0, %edi ; CHECK-NEXT: sbbq 8(%rsi), %rdi -; CHECK-NEXT: movl $0, %r8d -; CHECK-NEXT: sbbq 16(%rsi), %r8 -; CHECK-NEXT: sbbq 24(%rsi), %rcx +; CHECK-NEXT: sbbq 16(%rsi), %rcx +; CHECK-NEXT: setae %r8b +; CHECK-NEXT: movzbl %r8b, %r8d +; CHECK-NEXT: movq 24(%rsi), %rsi +; CHECK-NEXT: notq %rsi +; CHECK-NEXT: addq %r8, %rsi ; CHECK-NEXT: movq %rdx, (%rax) ; CHECK-NEXT: movq %rdi, 8(%rax) -; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %rcx, 24(%rax) +; CHECK-NEXT: movq %rcx, 16(%rax) +; CHECK-NEXT: movq %rsi, 24(%rax) ; CHECK-NEXT: retq entry: %0 = load i64, ptr %this, align 8 @@ -108,13 +111,13 @@ define %S @sub(ptr nocapture readonly %this, %S %arg.b) { ; CHECK-NEXT: movzbl %r10b, %r10d ; CHECK-NEXT: notq %r8 ; CHECK-NEXT: addq %rdx, %r8 -; CHECK-NEXT: adcq 24(%rsi), %r10 ; CHECK-NEXT: notq %r9 -; CHECK-NEXT: addq %r10, %r9 +; CHECK-NEXT: adcq 24(%rsi), %r10 +; CHECK-NEXT: addq %r9, %r10 ; CHECK-NEXT: movq %rdi, (%rax) ; CHECK-NEXT: movq %rcx, 8(%rax) ; CHECK-NEXT: movq %r8, 16(%rax) -; CHECK-NEXT: movq %r9, 24(%rax) +; CHECK-NEXT: movq %r10, 24(%rax) ; CHECK-NEXT: retq entry: %0 = extractvalue %S %arg.b, 0 diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll index 6a52acfe2fb30..18938b28be0a8 100644 --- a/llvm/test/CodeGen/X86/ucmp.ll +++ b/llvm/test/CodeGen/X86/ucmp.ll @@ -1479,7 +1479,6 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE4-NEXT: pushq %r12 ; SSE4-NEXT: pushq %rbx ; SSE4-NEXT: subq $120, %rsp -; SSE4-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: andl $127, %eax ; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1550,247 +1549,250 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: andl $127, %eax ; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE4-NEXT: andl $127, %r10d ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: andl $127, %eax ; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: andl $127, %ecx -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE4-NEXT: andl $127, %r8d -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx -; 
SSE4-NEXT: andl $127, %ebx -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE4-NEXT: andl $127, %edx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: andl $127, %eax +; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13 ; SSE4-NEXT: andl $127, %r13d -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE4-NEXT: andl $127, %r11d -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE4-NEXT: andl $127, %r14d +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE4-NEXT: andl $127, %ebx ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; SSE4-NEXT: andl $127, %r12d -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: andl $127, %edx ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; SSE4-NEXT: cmpq %rax, %rbp -; SSE4-NEXT: movq %r12, %r15 -; SSE4-NEXT: sbbq %r14, %r15 -; SSE4-NEXT: setb %r15b -; SSE4-NEXT: cmpq %rbp, %rax -; SSE4-NEXT: sbbq %r12, %r14 -; SSE4-NEXT: sbbb $0, %r15b -; SSE4-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: andl $127, %ebp +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE4-NEXT: andl $127, %r8d ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: andl $127, %eax ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE4-NEXT: cmpq %rax, %r14 -; SSE4-NEXT: movq %r11, %r15 -; SSE4-NEXT: sbbq %r13, %r15 -; SSE4-NEXT: setb %bpl -; SSE4-NEXT: cmpq %r14, %rax -; SSE4-NEXT: sbbq %r11, %r13 -; SSE4-NEXT: sbbb $0, %bpl -; SSE4-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE4-NEXT: cmpq %rax, %r11 -; SSE4-NEXT: movq %rdx, %r14 -; SSE4-NEXT: sbbq %rbx, %r14 -; SSE4-NEXT: setb %bpl -; SSE4-NEXT: cmpq %r11, %rax -; SSE4-NEXT: sbbq %rdx, %rbx -; SSE4-NEXT: sbbb $0, %bpl -; SSE4-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: andl $127, %r14d +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE4-NEXT: cmpq %r15, %r10 +; SSE4-NEXT: movq %r14, %r11 +; SSE4-NEXT: sbbq %rax, %r11 +; SSE4-NEXT: setb %r11b +; SSE4-NEXT: cmpq %r10, %r15 +; SSE4-NEXT: sbbq %r14, %rax +; SSE4-NEXT: sbbb $0, %r11b +; SSE4-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; SSE4-NEXT: cmpq %rax, %rdx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE4-NEXT: cmpq %rax, %r10 ; SSE4-NEXT: movq %r8, %r11 -; SSE4-NEXT: sbbq %rcx, %r11 +; SSE4-NEXT: sbbq %rbp, %r11 ; SSE4-NEXT: setb %r11b -; SSE4-NEXT: cmpq %rdx, %rax -; SSE4-NEXT: sbbq %r8, %rcx +; SSE4-NEXT: cmpq %r10, %rax +; SSE4-NEXT: sbbq %r8, %rbp ; SSE4-NEXT: sbbb $0, %r11b ; SSE4-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: movq %r8, %rdx -; SSE4-NEXT: sbbq %r10, %rdx -; SSE4-NEXT: setb %dl -; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r8, %r10 -; SSE4-NEXT: sbbb $0, %dl -; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE4-NEXT: movq %r11, %rdx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %rdx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE4-NEXT: cmpq %rax, %r8 +; SSE4-NEXT: movq %rdx, 
%r10 +; SSE4-NEXT: sbbq %r12, %r10 ; SSE4-NEXT: setb %r10b -; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r11, %r8 +; SSE4-NEXT: cmpq %r8, %rax +; SSE4-NEXT: sbbq %rdx, %r12 ; SSE4-NEXT: sbbb $0, %r10b +; SSE4-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: cmpq %rax, %rdx +; SSE4-NEXT: movq %rbx, %r8 +; SSE4-NEXT: sbbq %r13, %r8 +; SSE4-NEXT: setb %r8b +; SSE4-NEXT: cmpq %rdx, %rax +; SSE4-NEXT: sbbq %rbx, %r13 +; SSE4-NEXT: sbbb $0, %r8b +; SSE4-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: cmpq %rax, %rdx ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE4-NEXT: movq %r11, %rdx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %rdx -; SSE4-NEXT: setb %dl -; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r11, %r8 -; SSE4-NEXT: sbbb $0, %dl -; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: movq %r11, %r8 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE4-NEXT: sbbq %r10, %r8 +; SSE4-NEXT: setb %r8b +; SSE4-NEXT: cmpq %rdx, %rax +; SSE4-NEXT: sbbq %r11, %r10 +; SSE4-NEXT: sbbb $0, %r8b +; SSE4-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: cmpq %rax, %rdx ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE4-NEXT: movq %r11, %rdx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %rdx -; SSE4-NEXT: setb %bpl -; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r11, %r8 -; SSE4-NEXT: sbbb $0, %bpl +; SSE4-NEXT: movq %r11, %r8 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE4-NEXT: sbbq %r10, %r8 +; SSE4-NEXT: setb %r8b +; SSE4-NEXT: cmpq %rdx, %rax +; SSE4-NEXT: sbbq %r11, %r10 +; SSE4-NEXT: sbbb $0, %r8b +; SSE4-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: cmpq %rax, %rdx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE4-NEXT: movq %r11, %r8 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE4-NEXT: sbbq %r10, %r8 +; SSE4-NEXT: setb %r8b +; SSE4-NEXT: cmpq %rdx, %rax +; SSE4-NEXT: sbbq %r11, %r10 +; SSE4-NEXT: sbbb $0, %r8b +; SSE4-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: cmpq %rax, %rdx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE4-NEXT: movq %r11, %r8 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE4-NEXT: sbbq %r10, %r8 +; SSE4-NEXT: setb %r8b +; SSE4-NEXT: cmpq %rdx, %rax +; SSE4-NEXT: sbbq %r11, %r10 +; SSE4-NEXT: sbbb $0, %r8b +; SSE4-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE4-NEXT: cmpq %rax, %rdx ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE4-NEXT: movq %r11, %rdx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte 
Reload -; SSE4-NEXT: sbbq %r8, %rdx +; SSE4-NEXT: movq %r11, %r8 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; SSE4-NEXT: sbbq %r10, %r8 +; SSE4-NEXT: setb %r8b +; SSE4-NEXT: cmpq %rdx, %rax +; SSE4-NEXT: sbbq %r11, %r10 +; SSE4-NEXT: sbbb $0, %r8b +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE4-NEXT: cmpq %rax, %r10 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; SSE4-NEXT: movq %rbx, %rdx +; SSE4-NEXT: movq (%rsp), %r11 # 8-byte Reload +; SSE4-NEXT: sbbq %r11, %rdx ; SSE4-NEXT: setb %dl -; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r11, %r8 +; SSE4-NEXT: cmpq %r10, %rax +; SSE4-NEXT: sbbq %rbx, %r11 ; SSE4-NEXT: sbbb $0, %dl ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE4-NEXT: cmpq %rax, %r11 ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE4-NEXT: movq %r14, %r8 -; SSE4-NEXT: movq (%rsp), %rbx # 8-byte Reload -; SSE4-NEXT: sbbq %rbx, %r8 -; SSE4-NEXT: setb %r11b -; SSE4-NEXT: cmpq %rcx, %rax +; SSE4-NEXT: movq %r14, %r10 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; SSE4-NEXT: sbbq %rbx, %r10 +; SSE4-NEXT: setb %r10b +; SSE4-NEXT: cmpq %r11, %rax ; SSE4-NEXT: sbbq %r14, %rbx +; SSE4-NEXT: sbbb $0, %r10b +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; SSE4-NEXT: cmpq %rax, %rbx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE4-NEXT: movq %r15, %r11 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; SSE4-NEXT: sbbq %r14, %r11 +; SSE4-NEXT: setb %r11b +; SSE4-NEXT: cmpq %rbx, %rax +; SSE4-NEXT: sbbq %r15, %r14 ; SSE4-NEXT: sbbb $0, %r11b ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE4-NEXT: cmpq %rax, %rcx +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE4-NEXT: cmpq %rax, %r12 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE4-NEXT: movq %r15, %rbx ; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE4-NEXT: movq %r14, %rbx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %rbx +; SSE4-NEXT: sbbq %r14, %rbx ; SSE4-NEXT: setb %bl -; SSE4-NEXT: cmpq %rcx, %rax -; SSE4-NEXT: sbbq %r14, %r8 +; SSE4-NEXT: cmpq %r12, %rax +; SSE4-NEXT: sbbq %r15, %r14 ; SSE4-NEXT: sbbb $0, %bl ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE4-NEXT: cmpq %rax, %r14 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE4-NEXT: movq %r15, %rcx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %rcx -; SSE4-NEXT: setb %cl -; SSE4-NEXT: cmpq %r14, %rax -; SSE4-NEXT: sbbq %r15, %r8 -; SSE4-NEXT: sbbb $0, %cl -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15 -; SSE4-NEXT: cmpq %rax, %r15 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE4-NEXT: movq %r12, %r14 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %r14 -; SSE4-NEXT: setb %r14b -; SSE4-NEXT: cmpq %r15, %rax -; SSE4-NEXT: sbbq %r12, %r8 -; SSE4-NEXT: sbbb $0, %r14b -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: cmpq %r9, %rax -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE4-NEXT: movq %r12, %r15 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %r15 -; 
SSE4-NEXT: setb %r15b +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE4-NEXT: movq %r15, %r12 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; SSE4-NEXT: sbbq %r14, %r12 +; SSE4-NEXT: setb %r12b ; SSE4-NEXT: cmpq %rax, %r9 -; SSE4-NEXT: sbbq %r12, %r8 -; SSE4-NEXT: sbbb $0, %r15b +; SSE4-NEXT: sbbq %r15, %r14 +; SSE4-NEXT: sbbb $0, %r12b ; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE4-NEXT: cmpq %r12, %rax -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE4-NEXT: movq %r13, %r9 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: sbbq %r8, %r9 +; SSE4-NEXT: cmpq %rcx, %rax +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE4-NEXT: movq %r15, %r9 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; SSE4-NEXT: sbbq %r14, %r9 ; SSE4-NEXT: setb %r9b -; SSE4-NEXT: cmpq %rax, %r12 -; SSE4-NEXT: sbbq %r13, %r8 -; SSE4-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE4-NEXT: cmpq %rax, %rcx +; SSE4-NEXT: sbbq %r15, %r14 +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE4-NEXT: sbbb $0, %r9b -; SSE4-NEXT: cmpq %rsi, %r12 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: movq %r8, %rdi -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE4-NEXT: sbbq %rax, %rdi -; SSE4-NEXT: setb %dil -; SSE4-NEXT: cmpq %r12, %rsi -; SSE4-NEXT: sbbq %r8, %rax -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSE4-NEXT: sbbb $0, %dil -; SSE4-NEXT: cmpq %r12, %r13 -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE4-NEXT: movq %r8, %rsi -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE4-NEXT: sbbq %rax, %rsi -; SSE4-NEXT: setb %sil -; SSE4-NEXT: cmpq %r13, %r12 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm1 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm2 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm3 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm4 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload -; SSE4-NEXT: movd %r12d, %xmm5 -; SSE4-NEXT: movzbl %r10b, %r10d -; SSE4-NEXT: movd %r10d, %xmm6 -; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload -; SSE4-NEXT: movd %r10d, %xmm7 -; SSE4-NEXT: movzbl %bpl, %r10d -; SSE4-NEXT: movd %r10d, %xmm0 +; SSE4-NEXT: cmpq %rsi, %rax +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE4-NEXT: movq %r15, %rcx +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; SSE4-NEXT: sbbq %r14, %rcx +; SSE4-NEXT: setb %r13b +; SSE4-NEXT: cmpq %rax, %rsi +; SSE4-NEXT: sbbq %r15, %r14 +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; SSE4-NEXT: sbbb $0, %r13b +; SSE4-NEXT: cmpq %rsi, %r14 +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload +; SSE4-NEXT: movq %rbp, %rax +; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE4-NEXT: sbbq %r15, %rax +; SSE4-NEXT: movq %rdi, %rax +; SSE4-NEXT: setb %cl +; SSE4-NEXT: cmpq %r14, %rsi +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE4-NEXT: movd %esi, %xmm0 +; 
SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE4-NEXT: movd %esi, %xmm1 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE4-NEXT: movd %esi, %xmm2 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE4-NEXT: movd %esi, %xmm3 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE4-NEXT: movd %esi, %xmm4 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE4-NEXT: movd %esi, %xmm5 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE4-NEXT: movd %esi, %xmm6 +; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE4-NEXT: movd %esi, %xmm7 +; SSE4-NEXT: movzbl %r8b, %esi +; SSE4-NEXT: movd %esi, %xmm8 ; SSE4-NEXT: movzbl %dl, %edx -; SSE4-NEXT: movd %edx, %xmm8 -; SSE4-NEXT: movzbl %r11b, %edx ; SSE4-NEXT: movd %edx, %xmm9 -; SSE4-NEXT: movzbl %bl, %edx +; SSE4-NEXT: movzbl %r10b, %edx ; SSE4-NEXT: movd %edx, %xmm10 -; SSE4-NEXT: movzbl %cl, %ecx -; SSE4-NEXT: movd %ecx, %xmm11 -; SSE4-NEXT: movzbl %r14b, %ecx -; SSE4-NEXT: movd %ecx, %xmm12 -; SSE4-NEXT: movzbl %r15b, %ecx -; SSE4-NEXT: movd %ecx, %xmm13 -; SSE4-NEXT: movzbl %r9b, %ecx -; SSE4-NEXT: movd %ecx, %xmm14 -; SSE4-NEXT: movzbl %dil, %ecx -; SSE4-NEXT: movd %ecx, %xmm15 -; SSE4-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] -; SSE4-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] -; SSE4-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3] -; SSE4-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] -; SSE4-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7] -; SSE4-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] -; SSE4-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1] +; SSE4-NEXT: movzbl %r11b, %edx +; SSE4-NEXT: movd %edx, %xmm11 +; SSE4-NEXT: movzbl %bl, %edx +; SSE4-NEXT: movd %edx, %xmm12 +; SSE4-NEXT: movzbl %r12b, %edx +; SSE4-NEXT: movd %edx, %xmm13 +; SSE4-NEXT: movzbl %r9b, %edx +; SSE4-NEXT: movd %edx, %xmm14 +; SSE4-NEXT: movzbl %r13b, %edx +; SSE4-NEXT: movd %edx, %xmm15 +; SSE4-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE4-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] +; SSE4-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE4-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] +; SSE4-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] +; SSE4-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3] +; SSE4-NEXT: punpckldq {{.*#+}} xmm7 = 
xmm7[0],xmm3[0],xmm7[1],xmm3[1] ; SSE4-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7] ; SSE4-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7] ; SSE4-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3] @@ -1798,13 +1800,11 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE4-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] ; SSE4-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3] ; SSE4-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1] -; SSE4-NEXT: sbbq %r8, %rax -; SSE4-NEXT: sbbb $0, %sil -; SSE4-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0] -; SSE4-NEXT: movzbl %sil, %ecx -; SSE4-NEXT: andl $3, %ecx -; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; SSE4-NEXT: movb %cl, 4(%rax) +; SSE4-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0] +; SSE4-NEXT: sbbq %rbp, %r15 +; SSE4-NEXT: sbbb $0, %cl +; SSE4-NEXT: andb $3, %cl +; SSE4-NEXT: movb %cl, 4(%rdi) ; SSE4-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp) ; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx ; SSE4-NEXT: andl $3, %ecx @@ -1886,7 +1886,6 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: pushq %r12 ; SSE2-NEXT: pushq %rbx ; SSE2-NEXT: subq $88, %rsp -; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -1914,6 +1913,9 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax +; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, (%rsp) # 8-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax @@ -1957,161 +1959,160 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: andl $127, %ecx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: andl $127, %eax ; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; SSE2-NEXT: andl $127, %r13d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE2-NEXT: andl $127, %ebx ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: andl $127, %edx -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; SSE2-NEXT: andl $127, %r10d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE2-NEXT: andl $127, %r8d +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 +; SSE2-NEXT: andl $127, %r12d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: andl $127, %r14d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; SSE2-NEXT: andl $127, %ebp -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; SSE2-NEXT: andl $127, %r13d -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; SSE2-NEXT: andl 
$127, %r11d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; SSE2-NEXT: andl $127, %r15d ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12 -; SSE2-NEXT: cmpq %rax, %r12 -; SSE2-NEXT: movq %r15, %r8 -; SSE2-NEXT: sbbq %r11, %r8 -; SSE2-NEXT: setb %r8b -; SSE2-NEXT: cmpq %r12, %rax -; SSE2-NEXT: sbbq %r15, %r11 -; SSE2-NEXT: sbbb $0, %r8b -; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: cmpq %rax, %r11 +; SSE2-NEXT: movq %r15, %r10 +; SSE2-NEXT: sbbq %rbp, %r10 +; SSE2-NEXT: setb %r10b +; SSE2-NEXT: cmpq %r11, %rax +; SSE2-NEXT: sbbq %r15, %rbp +; SSE2-NEXT: sbbb $0, %r10b +; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %r13, %r11 -; SSE2-NEXT: sbbq %rbp, %r11 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: cmpq %rax, %r10 +; SSE2-NEXT: movq %r14, %r11 +; SSE2-NEXT: sbbq %r12, %r11 ; SSE2-NEXT: setb %r11b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %r13, %rbp +; SSE2-NEXT: cmpq %r10, %rax +; SSE2-NEXT: sbbq %r14, %r12 ; SSE2-NEXT: sbbb $0, %r11b ; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %r14, %r11 -; SSE2-NEXT: sbbq %r10, %r11 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: cmpq %rax, %r10 +; SSE2-NEXT: movq %r8, %r11 +; SSE2-NEXT: sbbq %rdx, %r11 ; SSE2-NEXT: setb %r11b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %r14, %r10 +; SSE2-NEXT: cmpq %r10, %rax +; SSE2-NEXT: sbbq %r8, %rdx ; SSE2-NEXT: sbbb $0, %r11b ; SSE2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; SSE2-NEXT: cmpq %rax, %r8 -; SSE2-NEXT: movq %rdx, %r10 -; SSE2-NEXT: sbbq %rbx, %r10 -; SSE2-NEXT: setb %r10b -; SSE2-NEXT: cmpq %r8, %rax -; SSE2-NEXT: sbbq %rdx, %rbx -; SSE2-NEXT: sbbb $0, %r10b -; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; SSE2-NEXT: cmpq %rax, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: movq %r10, %r8 -; SSE2-NEXT: sbbq %rcx, %r8 +; SSE2-NEXT: movq %rbx, %r8 +; SSE2-NEXT: sbbq %r13, %r8 ; SSE2-NEXT: setb %r8b ; SSE2-NEXT: cmpq %rdx, %rax -; SSE2-NEXT: sbbq %r10, %rcx +; SSE2-NEXT: sbbq %rbx, %r13 ; SSE2-NEXT: sbbb $0, %r8b ; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: cmpq %rax, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE2-NEXT: movq %r11, %r8 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: movq %r10, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE2-NEXT: sbbq %r8, %rdx -; SSE2-NEXT: setb %dl -; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: sbbq %r10, %r8 -; SSE2-NEXT: sbbb $0, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: setb %r8b +; SSE2-NEXT: cmpq %rdx, %rax +; SSE2-NEXT: sbbq %r11, %r10 +; SSE2-NEXT: sbbb $0, %r8b +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq 
{{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: cmpq %rax, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE2-NEXT: movq %r11, %r8 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: movq %r10, %rdx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; SSE2-NEXT: sbbq %r8, %rdx -; SSE2-NEXT: setb %dl -; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: sbbq %r10, %r8 -; SSE2-NEXT: sbbb $0, %dl -; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; SSE2-NEXT: setb %r8b +; SSE2-NEXT: cmpq %rdx, %rax +; SSE2-NEXT: sbbq %r11, %r10 +; SSE2-NEXT: sbbb $0, %r8b +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: cmpq %rax, %rdx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; SSE2-NEXT: movq %r11, %rdx +; SSE2-NEXT: movq %r11, %r8 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: sbbq %r10, %rdx +; SSE2-NEXT: sbbq %r10, %r8 ; SSE2-NEXT: setb %r8b -; SSE2-NEXT: cmpq %rcx, %rax +; SSE2-NEXT: cmpq %rdx, %rax ; SSE2-NEXT: sbbq %r11, %r10 ; SSE2-NEXT: sbbb $0, %r8b +; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; SSE2-NEXT: movq %rbx, %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; SSE2-NEXT: cmpq %rax, %rdx +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE2-NEXT: movq %r11, %r8 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; SSE2-NEXT: sbbq %r10, %rdx -; SSE2-NEXT: setb %r11b -; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %rbx, %r10 -; SSE2-NEXT: sbbb $0, %r11b +; SSE2-NEXT: sbbq %r10, %r8 +; SSE2-NEXT: setb %bpl +; SSE2-NEXT: cmpq %rdx, %rax +; SSE2-NEXT: sbbq %r11, %r10 +; SSE2-NEXT: sbbb $0, %bpl ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; SSE2-NEXT: movq %rbx, %rdx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; SSE2-NEXT: cmpq %rax, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE2-NEXT: movq %r11, %rdx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; SSE2-NEXT: sbbq %r10, %rdx ; SSE2-NEXT: setb %dl -; SSE2-NEXT: cmpq %rcx, %rax -; SSE2-NEXT: sbbq %rbx, %r10 +; SSE2-NEXT: cmpq %r8, %rax +; SSE2-NEXT: sbbq %r11, %r10 ; SSE2-NEXT: sbbb $0, %dl ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; SSE2-NEXT: cmpq %rax, %r10 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload +; SSE2-NEXT: movq %rbx, %r8 +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; SSE2-NEXT: sbbq %r11, %r8 +; SSE2-NEXT: setb %r8b +; SSE2-NEXT: cmpq %r10, %rax +; SSE2-NEXT: sbbq %rbx, %r11 +; SSE2-NEXT: sbbb $0, %r8b +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; SSE2-NEXT: cmpq %rax, %r11 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload ; SSE2-NEXT: movq %r14, %r10 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload ; SSE2-NEXT: sbbq %rbx, %r10 ; 
SSE2-NEXT: setb %r10b -; SSE2-NEXT: cmpq %rcx, %rax +; SSE2-NEXT: cmpq %r11, %rax ; SSE2-NEXT: sbbq %r14, %rbx ; SSE2-NEXT: sbbb $0, %r10b ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; SSE2-NEXT: cmpq %rax, %rbx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: movq %r15, %rcx -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: sbbq %r14, %rcx -; SSE2-NEXT: setb %cl +; SSE2-NEXT: movq %r15, %r11 +; SSE2-NEXT: movq (%rsp), %r14 # 8-byte Reload +; SSE2-NEXT: sbbq %r14, %r11 +; SSE2-NEXT: setb %r11b ; SSE2-NEXT: cmpq %rbx, %rax ; SSE2-NEXT: sbbq %r15, %r14 -; SSE2-NEXT: sbbb $0, %cl +; SSE2-NEXT: sbbb $0, %r11b ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; SSE2-NEXT: cmpq %rax, %r14 -; SSE2-NEXT: movq (%rsp), %r12 # 8-byte Reload +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload ; SSE2-NEXT: movq %r12, %rbx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; SSE2-NEXT: sbbq %r15, %rbx @@ -2125,52 +2126,48 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: movq %r12, %r14 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; SSE2-NEXT: sbbq %r15, %r14 -; SSE2-NEXT: setb %bpl +; SSE2-NEXT: setb %r14b ; SSE2-NEXT: cmpq %rax, %r9 ; SSE2-NEXT: sbbq %r12, %r15 -; SSE2-NEXT: sbbb $0, %bpl +; SSE2-NEXT: sbbb $0, %r14b ; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: cmpq %rsi, %rax +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; SSE2-NEXT: movq %r12, %r9 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: movq %r15, %r9 -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: sbbq %r14, %r9 +; SSE2-NEXT: sbbq %r15, %r9 ; SSE2-NEXT: setb %r9b ; SSE2-NEXT: cmpq %rax, %rsi -; SSE2-NEXT: sbbq %r15, %r14 -; SSE2-NEXT: movq %rdi, %rax -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; SSE2-NEXT: sbbq %r12, %r15 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; SSE2-NEXT: sbbb $0, %r9b -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; SSE2-NEXT: cmpq %r15, %rsi +; SSE2-NEXT: cmpq %rcx, %rax ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE2-NEXT: movq %r12, %rdi -; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; SSE2-NEXT: sbbq %r14, %rdi -; SSE2-NEXT: setb %dil -; SSE2-NEXT: cmpq %rsi, %r15 -; SSE2-NEXT: sbbq %r12, %r14 -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14 -; SSE2-NEXT: sbbb $0, %dil -; SSE2-NEXT: cmpq %rsi, %r14 +; SSE2-NEXT: movq %r12, %rsi +; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; SSE2-NEXT: sbbq %r15, %rsi +; SSE2-NEXT: setb %sil +; SSE2-NEXT: cmpq %rax, %rcx +; SSE2-NEXT: sbbq %r12, %r15 +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax +; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; SSE2-NEXT: sbbb $0, %sil +; SSE2-NEXT: cmpq %rax, %r15 ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; SSE2-NEXT: movq %r13, %r15 +; SSE2-NEXT: movq %r13, %rcx ; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; SSE2-NEXT: sbbq %r12, %r15 -; SSE2-NEXT: setb %r15b -; SSE2-NEXT: cmpq %r14, %rsi +; SSE2-NEXT: sbbq %r12, %rcx +; SSE2-NEXT: setb %cl +; SSE2-NEXT: cmpq %r15, %rax ; SSE2-NEXT: sbbq %r13, %r12 -; SSE2-NEXT: sbbb $0, %r15b -; SSE2-NEXT: movzbl %r15b, %esi -; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: movb %sil, 4(%rax) -; SSE2-NEXT: movzbl %dil, %esi +; SSE2-NEXT: movq %rdi, %rax 
+; SSE2-NEXT: movzbl %sil, %esi +; SSE2-NEXT: sbbb $0, %cl ; SSE2-NEXT: movzbl %r9b, %edi ; SSE2-NEXT: andl $3, %esi ; SSE2-NEXT: andl $3, %edi ; SSE2-NEXT: leaq (%rdi,%rsi,4), %rsi -; SSE2-NEXT: movzbl %bpl, %edi +; SSE2-NEXT: movzbl %r14b, %edi ; SSE2-NEXT: andl $3, %edi ; SSE2-NEXT: shll $4, %edi ; SSE2-NEXT: orq %rsi, %rdi @@ -2178,55 +2175,57 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; SSE2-NEXT: andl $3, %r9d ; SSE2-NEXT: shll $6, %r9d ; SSE2-NEXT: orq %rdi, %r9 -; SSE2-NEXT: movzbl %cl, %esi +; SSE2-NEXT: movzbl %r11b, %esi ; SSE2-NEXT: andl $3, %esi ; SSE2-NEXT: shll $8, %esi ; SSE2-NEXT: orq %r9, %rsi -; SSE2-NEXT: movzbl %dl, %ecx -; SSE2-NEXT: movzbl %r10b, %edx -; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $10, %edx -; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shll $12, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: movzbl %r11b, %edx +; SSE2-NEXT: movzbl %r8b, %edi +; SSE2-NEXT: movzbl %r10b, %r8d +; SSE2-NEXT: andl $3, %r8d +; SSE2-NEXT: shll $10, %r8d +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: shll $12, %edi +; SSE2-NEXT: orq %r8, %rdi +; SSE2-NEXT: movzbl %dl, %r8d +; SSE2-NEXT: andl $3, %r8d +; SSE2-NEXT: shll $14, %r8d +; SSE2-NEXT: orq %rdi, %r8 +; SSE2-NEXT: movzbl %bpl, %edx ; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $14, %edx -; SSE2-NEXT: orq %rcx, %rdx -; SSE2-NEXT: movzbl %r8b, %ecx -; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shll $16, %ecx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: orq %rsi, %rcx -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: shll $16, %edx +; SSE2-NEXT: orq %r8, %rdx +; SSE2-NEXT: orq %rsi, %rdx ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: shll $18, %edi ; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shll $18, %esi -; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $20, %edx -; SSE2-NEXT: orq %rsi, %rdx +; SSE2-NEXT: shll $20, %esi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: shll $22, %edi +; SSE2-NEXT: orq %rsi, %rdi ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shll $22, %esi -; SSE2-NEXT: orq %rdx, %rsi +; SSE2-NEXT: shll $24, %esi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; SSE2-NEXT: andl $3, %edi +; SSE2-NEXT: shlq $26, %rdi +; SSE2-NEXT: orq %rsi, %rdi +; SSE2-NEXT: orq %rdx, %rdi ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shll $24, %edx -; SSE2-NEXT: orq %rsi, %rdx ; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; SSE2-NEXT: andl $3, %esi -; SSE2-NEXT: shlq $26, %rsi -; SSE2-NEXT: orq %rdx, %rsi -; SSE2-NEXT: orq %rcx, %rsi -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; SSE2-NEXT: shlq $28, %rsi ; SSE2-NEXT: andl $3, %edx -; SSE2-NEXT: shlq $28, %rdx -; SSE2-NEXT: andl $3, %ecx -; SSE2-NEXT: shlq $30, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: orq %rsi, %rcx -; SSE2-NEXT: movl %ecx, (%rax) +; SSE2-NEXT: shlq $30, %rdx +; SSE2-NEXT: orq %rsi, %rdx +; SSE2-NEXT: orq %rdi, %rdx +; SSE2-NEXT: andb $3, %cl +; SSE2-NEXT: movb %cl, 4(%rax) +; SSE2-NEXT: movl %edx, (%rax) ; SSE2-NEXT: addq $88, 
%rsp ; SSE2-NEXT: popq %rbx ; SSE2-NEXT: popq %r12 @@ -2321,62 +2320,62 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: andl $127, %eax ; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX2-NEXT: andl $127, %r10d ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX2-NEXT: andl $127, %r15d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX2-NEXT: andl $127, %edx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 +; AVX2-NEXT: andl $127, %r13d ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: andl $127, %eax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX2-NEXT: andl $127, %r14d -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: andl $127, %edx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp -; AVX2-NEXT: andl $127, %ebp -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX2-NEXT: andl $127, %r8d ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12 ; AVX2-NEXT: andl $127, %r12d -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13 -; AVX2-NEXT: andl $127, %r13d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: andl $127, %r8d +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx +; AVX2-NEXT: cmpq %rbp, %rbx +; AVX2-NEXT: movq %r8, %r11 +; AVX2-NEXT: sbbq %r12, %r11 +; AVX2-NEXT: setb %r11b +; AVX2-NEXT: cmpq %rbx, %rbp +; AVX2-NEXT: sbbq %r8, %r12 +; AVX2-NEXT: sbbb $0, %r11b +; AVX2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: cmpq %rbx, %r11 -; AVX2-NEXT: movq %r13, %r10 -; AVX2-NEXT: sbbq %r12, %r10 -; AVX2-NEXT: setb %r10b -; AVX2-NEXT: cmpq %r11, %rbx -; AVX2-NEXT: sbbq %r13, %r12 -; AVX2-NEXT: sbbb $0, %r10b -; AVX2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX2-NEXT: cmpq %r10, %r11 -; AVX2-NEXT: movq %r8, %rbx -; AVX2-NEXT: sbbq %rbp, %rbx +; AVX2-NEXT: cmpq %r8, %r11 +; AVX2-NEXT: movq %r14, %rbx +; AVX2-NEXT: sbbq %rax, %rbx ; AVX2-NEXT: setb %bl -; AVX2-NEXT: cmpq %r11, %r10 -; AVX2-NEXT: sbbq %r8, %rbp +; AVX2-NEXT: cmpq %r11, %r8 +; AVX2-NEXT: sbbq %r14, %rax ; AVX2-NEXT: sbbb $0, %bl ; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: cmpq %r8, %r10 -; AVX2-NEXT: movq %rdx, %r11 -; AVX2-NEXT: sbbq %r14, %r11 +; AVX2-NEXT: cmpq %rax, %r8 +; AVX2-NEXT: movq %r13, %r11 +; AVX2-NEXT: sbbq %rdx, %r11 ; AVX2-NEXT: setb %r11b -; AVX2-NEXT: cmpq %r10, %r8 -; AVX2-NEXT: sbbq %rdx, %r14 +; AVX2-NEXT: cmpq %r8, %rax +; AVX2-NEXT: sbbq %r13, %rdx ; AVX2-NEXT: sbbb $0, %r11b ; AVX2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX2-NEXT: cmpq %rdx, %r8 -; AVX2-NEXT: movq %rax, %r10 +; AVX2-NEXT: cmpq %rax, %rdx +; AVX2-NEXT: movq %r15, %r8 +; AVX2-NEXT: sbbq %r10, %r8 +; AVX2-NEXT: setb %r8b +; AVX2-NEXT: cmpq %rdx, %rax ; AVX2-NEXT: sbbq %r15, %r10 -; AVX2-NEXT: setb %r10b -; AVX2-NEXT: cmpq %r8, %rdx -; AVX2-NEXT: sbbq %rax, %r15 -; AVX2-NEXT: sbbb $0, %r10b -; AVX2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; AVX2-NEXT: sbbb $0, %r8b +; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; 
AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx ; AVX2-NEXT: cmpq %rax, %rdx @@ -2420,43 +2419,43 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX2-NEXT: movq %r11, %r8 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload ; AVX2-NEXT: sbbq %r10, %r8 -; AVX2-NEXT: setb %r12b +; AVX2-NEXT: setb %bpl ; AVX2-NEXT: cmpq %rdx, %rax ; AVX2-NEXT: sbbq %r11, %r10 -; AVX2-NEXT: sbbb $0, %r12b +; AVX2-NEXT: sbbb $0, %bpl ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx -; AVX2-NEXT: cmpq %rax, %rdx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: cmpq %rax, %r8 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: movq %r11, %r8 +; AVX2-NEXT: movq %r11, %rdx ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX2-NEXT: sbbq %r10, %r8 -; AVX2-NEXT: setb %r8b -; AVX2-NEXT: cmpq %rdx, %rax +; AVX2-NEXT: sbbq %r10, %rdx +; AVX2-NEXT: setb %dl +; AVX2-NEXT: cmpq %r8, %rax ; AVX2-NEXT: sbbq %r11, %r10 -; AVX2-NEXT: sbbb $0, %r8b +; AVX2-NEXT: sbbb $0, %dl ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX2-NEXT: cmpq %rax, %r10 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8 +; AVX2-NEXT: cmpq %rax, %r8 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: movq %rbx, %rdx +; AVX2-NEXT: movq %rbx, %r10 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX2-NEXT: sbbq %r11, %rdx -; AVX2-NEXT: setb %dl -; AVX2-NEXT: cmpq %r10, %rax +; AVX2-NEXT: sbbq %r11, %r10 +; AVX2-NEXT: setb %r10b +; AVX2-NEXT: cmpq %r8, %rax ; AVX2-NEXT: sbbq %rbx, %r11 -; AVX2-NEXT: sbbb $0, %dl +; AVX2-NEXT: sbbb $0, %r10b ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11 ; AVX2-NEXT: cmpq %rax, %r11 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: movq %r14, %r10 +; AVX2-NEXT: movq %r14, %r8 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX2-NEXT: sbbq %rbx, %r10 -; AVX2-NEXT: setb %r10b +; AVX2-NEXT: sbbq %rbx, %r8 +; AVX2-NEXT: setb %r8b ; AVX2-NEXT: cmpq %r11, %rax ; AVX2-NEXT: sbbq %r14, %rbx -; AVX2-NEXT: sbbb $0, %r10b +; AVX2-NEXT: sbbb $0, %r8b ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx ; AVX2-NEXT: cmpq %rax, %rbx @@ -2471,121 +2470,121 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14 ; AVX2-NEXT: cmpq %rax, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq %r13, %rbx +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX2-NEXT: movq %r12, %rbx ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; AVX2-NEXT: sbbq %r15, %rbx ; AVX2-NEXT: setb %bl ; AVX2-NEXT: cmpq %r14, %rax -; AVX2-NEXT: sbbq %r13, %r15 +; AVX2-NEXT: sbbq %r12, %r15 ; AVX2-NEXT: sbbb $0, %bl ; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: cmpq %r9, %rax -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq %r13, %r14 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX2-NEXT: movq %r12, %r14 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload ; AVX2-NEXT: sbbq %r15, %r14 -; AVX2-NEXT: setb %bpl +; AVX2-NEXT: setb %r14b ; AVX2-NEXT: cmpq %rax, %r9 -; AVX2-NEXT: sbbq %r13, %r15 -; AVX2-NEXT: sbbb $0, %bpl +; AVX2-NEXT: sbbq %r12, %r15 +; AVX2-NEXT: sbbb $0, %r14b ; AVX2-NEXT: movq 
{{[0-9]+}}(%rsp), %rax ; AVX2-NEXT: cmpq %rsi, %rax +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload +; AVX2-NEXT: movq %r12, %r9 ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: movq %r15, %r9 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: sbbq %r14, %r9 +; AVX2-NEXT: sbbq %r15, %r9 ; AVX2-NEXT: setb %r9b ; AVX2-NEXT: cmpq %rax, %rsi -; AVX2-NEXT: sbbq %r15, %r14 +; AVX2-NEXT: sbbq %r12, %r15 +; AVX2-NEXT: movq %rdi, %r12 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi ; AVX2-NEXT: sbbb $0, %r9b -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: cmpq %rcx, %rax +; AVX2-NEXT: cmpq %rcx, %rsi +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: movq %rax, %rdi ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: movq %r15, %rsi -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX2-NEXT: sbbq %r14, %rsi -; AVX2-NEXT: setb %sil -; AVX2-NEXT: cmpq %rax, %rcx -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax -; AVX2-NEXT: sbbq %r15, %r14 -; AVX2-NEXT: sbbb $0, %sil -; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx -; AVX2-NEXT: cmpq %rax, %rcx +; AVX2-NEXT: sbbq %r15, %rdi +; AVX2-NEXT: setb %dil +; AVX2-NEXT: cmpq %rsi, %rcx +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi +; AVX2-NEXT: sbbq %rax, %r15 +; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15 +; AVX2-NEXT: sbbb $0, %dil +; AVX2-NEXT: cmpq %rsi, %r15 +; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload -; AVX2-NEXT: movq %r13, %r14 -; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX2-NEXT: sbbq %r15, %r14 -; AVX2-NEXT: setb %r14b -; AVX2-NEXT: cmpq %rcx, %rax -; AVX2-NEXT: sbbq %r13, %r15 -; AVX2-NEXT: movq %rdi, %rax -; AVX2-NEXT: sbbb $0, %r14b -; AVX2-NEXT: movzbl %r14b, %ecx -; AVX2-NEXT: andl $3, %ecx -; AVX2-NEXT: movb %cl, 4(%rdi) -; AVX2-NEXT: movzbl %sil, %ecx -; AVX2-NEXT: andl $3, %ecx -; AVX2-NEXT: movzbl %r9b, %esi -; AVX2-NEXT: andl $3, %esi -; AVX2-NEXT: leaq (%rsi,%rcx,4), %rcx -; AVX2-NEXT: movzbl %bpl, %esi +; AVX2-NEXT: sbbq %r13, %rcx +; AVX2-NEXT: setb %cl +; AVX2-NEXT: cmpq %r15, %rsi +; AVX2-NEXT: sbbq %rax, %r13 +; AVX2-NEXT: sbbb $0, %cl +; AVX2-NEXT: movzbl %dil, %esi ; AVX2-NEXT: andl $3, %esi -; AVX2-NEXT: shll $4, %esi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: movzbl %bl, %ecx -; AVX2-NEXT: andl $3, %ecx -; AVX2-NEXT: shll $6, %ecx -; AVX2-NEXT: orq %rsi, %rcx +; AVX2-NEXT: movzbl %r9b, %edi +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: leaq (%rdi,%rsi,4), %rsi +; AVX2-NEXT: movzbl %r14b, %edi +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: shll $4, %edi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: movzbl %bl, %r9d +; AVX2-NEXT: andl $3, %r9d +; AVX2-NEXT: shll $6, %r9d +; AVX2-NEXT: orq %rdi, %r9 ; AVX2-NEXT: movzbl %r11b, %esi ; AVX2-NEXT: andl $3, %esi ; AVX2-NEXT: shll $8, %esi -; AVX2-NEXT: orq %rcx, %rsi -; AVX2-NEXT: movzbl %r10b, %ecx -; AVX2-NEXT: andl $3, %ecx -; AVX2-NEXT: shll $10, %ecx -; AVX2-NEXT: movzbl %dl, %edx -; AVX2-NEXT: andl $3, %edx -; AVX2-NEXT: shll $12, %edx -; AVX2-NEXT: orq %rcx, %rdx +; AVX2-NEXT: orq %r9, %rsi ; AVX2-NEXT: movzbl %r8b, %edi ; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: shll $10, %edi +; AVX2-NEXT: movzbl %r10b, %r8d +; AVX2-NEXT: andl $3, %r8d +; AVX2-NEXT: shll $12, %r8d +; AVX2-NEXT: orq %rdi, %r8 +; AVX2-NEXT: movzbl %dl, %edi +; AVX2-NEXT: andl $3, %edi ; AVX2-NEXT: shll $14, %edi -; AVX2-NEXT: orq %rdx, %rdi -; AVX2-NEXT: 
movzbl %r12b, %ecx -; AVX2-NEXT: andl $3, %ecx -; AVX2-NEXT: shll $16, %ecx -; AVX2-NEXT: orq %rdi, %rcx -; AVX2-NEXT: orq %rsi, %rcx -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: orq %r8, %rdi +; AVX2-NEXT: movzbl %bpl, %edx ; AVX2-NEXT: andl $3, %edx -; AVX2-NEXT: shll $18, %edx +; AVX2-NEXT: shll $16, %edx +; AVX2-NEXT: orq %rdi, %rdx +; AVX2-NEXT: orq %rsi, %rdx ; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; AVX2-NEXT: andl $3, %esi -; AVX2-NEXT: shll $20, %esi -; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX2-NEXT: andl $3, %edx -; AVX2-NEXT: shll $22, %edx -; AVX2-NEXT: orq %rsi, %rdx +; AVX2-NEXT: shll $18, %esi +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: shll $20, %edi +; AVX2-NEXT: orq %rsi, %rdi ; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; AVX2-NEXT: andl $3, %esi -; AVX2-NEXT: shll $24, %esi -; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX2-NEXT: andl $3, %edx -; AVX2-NEXT: shlq $26, %rdx -; AVX2-NEXT: orq %rsi, %rdx -; AVX2-NEXT: orq %rcx, %rdx -; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX2-NEXT: andl $3, %ecx -; AVX2-NEXT: shlq $28, %rcx +; AVX2-NEXT: shll $22, %esi +; AVX2-NEXT: orq %rdi, %rsi +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: shll $24, %edi +; AVX2-NEXT: orq %rsi, %rdi ; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload ; AVX2-NEXT: andl $3, %esi -; AVX2-NEXT: shlq $30, %rsi -; AVX2-NEXT: orq %rcx, %rsi +; AVX2-NEXT: shlq $26, %rsi +; AVX2-NEXT: orq %rdi, %rsi ; AVX2-NEXT: orq %rdx, %rsi -; AVX2-NEXT: movl %esi, (%rax) +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload +; AVX2-NEXT: andl $3, %edx +; AVX2-NEXT: shlq $28, %rdx +; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload +; AVX2-NEXT: andl $3, %edi +; AVX2-NEXT: shlq $30, %rdi +; AVX2-NEXT: orq %rdx, %rdi +; AVX2-NEXT: orq %rsi, %rdi +; AVX2-NEXT: andb $3, %cl +; AVX2-NEXT: movb %cl, 4(%r12) +; AVX2-NEXT: movl %edi, (%r12) +; AVX2-NEXT: movq %r12, %rax ; AVX2-NEXT: addq $88, %rsp ; AVX2-NEXT: popq %rbx ; AVX2-NEXT: popq %r12 @@ -2789,161 +2788,159 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; AVX512-NEXT: cmpq %rdx, %rcx ; AVX512-NEXT: sbbq %rdi, %rsi ; AVX512-NEXT: sbbb $0, %r9b -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: cmpq %rdx, %rsi +; AVX512-NEXT: cmpq %rcx, %rsi ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload -; AVX512-NEXT: movq %rdi, %rcx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: sbbq %rax, %rcx -; AVX512-NEXT: setb %cl -; AVX512-NEXT: cmpq %rsi, %rdx -; AVX512-NEXT: sbbq %rdi, %rax -; AVX512-NEXT: sbbb $0, %cl -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: cmpq %rsi, %rdi -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload -; AVX512-NEXT: movq %r8, %rdx +; AVX512-NEXT: movq %rdi, %rdx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: sbbq %rax, %rdx ; AVX512-NEXT: setb %dl -; AVX512-NEXT: cmpq %rdi, %rsi -; AVX512-NEXT: sbbq 
%r8, %rax +; AVX512-NEXT: cmpq %rsi, %rcx +; AVX512-NEXT: sbbq %rdi, %rax ; AVX512-NEXT: sbbb $0, %dl +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: cmpq %rdi, %r8 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload -; AVX512-NEXT: movq %r10, %rsi +; AVX512-NEXT: cmpq %rcx, %rdi +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload +; AVX512-NEXT: movq %r8, %rsi ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: sbbq %rax, %rsi ; AVX512-NEXT: setb %sil -; AVX512-NEXT: cmpq %r8, %rdi -; AVX512-NEXT: sbbq %r10, %rax +; AVX512-NEXT: cmpq %rdi, %rcx +; AVX512-NEXT: sbbq %r8, %rax ; AVX512-NEXT: sbbb $0, %sil +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 -; AVX512-NEXT: cmpq %r8, %r10 -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload -; AVX512-NEXT: movq %r11, %rdi +; AVX512-NEXT: cmpq %rcx, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload +; AVX512-NEXT: movq %r10, %rdi ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; AVX512-NEXT: sbbq %rax, %rdi ; AVX512-NEXT: setb %dil -; AVX512-NEXT: cmpq %r10, %r8 -; AVX512-NEXT: sbbq %r11, %rax +; AVX512-NEXT: cmpq %r8, %rcx +; AVX512-NEXT: sbbq %r10, %rax ; AVX512-NEXT: sbbb $0, %dil +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10 +; AVX512-NEXT: cmpq %rcx, %r10 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload +; AVX512-NEXT: movq %r11, %r8 +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: sbbq %rax, %r8 +; AVX512-NEXT: setb %r8b +; AVX512-NEXT: cmpq %r10, %rcx +; AVX512-NEXT: sbbq %r11, %rax +; AVX512-NEXT: sbbb $0, %r8b +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: cmpq %rax, %r10 +; AVX512-NEXT: cmpq %rax, %rcx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: movq %rbx, %r8 +; AVX512-NEXT: movq %rbx, %r10 ; AVX512-NEXT: movq (%rsp), %r11 # 8-byte Reload -; AVX512-NEXT: sbbq %r11, %r8 -; AVX512-NEXT: setb %r8b -; AVX512-NEXT: cmpq %r10, %rax +; AVX512-NEXT: sbbq %r11, %r10 +; AVX512-NEXT: setb %r10b +; AVX512-NEXT: cmpq %rcx, %rax ; AVX512-NEXT: sbbq %rbx, %r11 -; AVX512-NEXT: sbbb $0, %r8b -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 +; AVX512-NEXT: sbbb $0, %r10b +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload -; AVX512-NEXT: cmpq %rbx, %r11 +; AVX512-NEXT: cmpq %rbx, %rcx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: movq %r14, %r10 +; AVX512-NEXT: movq %r14, %r11 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: sbbq %rax, %r10 -; AVX512-NEXT: setb %r10b -; AVX512-NEXT: cmpq %r11, %rbx +; AVX512-NEXT: sbbq %rax, %r11 +; AVX512-NEXT: setb %r11b +; AVX512-NEXT: cmpq %rcx, %rbx ; AVX512-NEXT: sbbq %r14, %rax -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: sbbb $0, %r10b -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload -; AVX512-NEXT: cmpq %r15, %r11 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx +; AVX512-NEXT: sbbb $0, %r11b +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload +; AVX512-NEXT: cmpq %r14, %rcx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload ; 
AVX512-NEXT: movq %rax, %rbx -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: sbbq %r14, %rbx +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload +; AVX512-NEXT: sbbq %r15, %rbx ; AVX512-NEXT: setb %bl -; AVX512-NEXT: cmpq %r11, %r15 -; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11 -; AVX512-NEXT: sbbq %rax, %r14 +; AVX512-NEXT: cmpq %rcx, %r14 ; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14 +; AVX512-NEXT: sbbq %rax, %r15 +; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15 ; AVX512-NEXT: sbbb $0, %bl -; AVX512-NEXT: cmpq %r11, %r14 +; AVX512-NEXT: cmpq %r14, %r15 ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload -; AVX512-NEXT: movq %rax, %r15 +; AVX512-NEXT: movq %rax, %rcx ; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload -; AVX512-NEXT: sbbq %r12, %r15 -; AVX512-NEXT: setb %r15b -; AVX512-NEXT: cmpq %r14, %r11 +; AVX512-NEXT: sbbq %r12, %rcx +; AVX512-NEXT: setb %cl +; AVX512-NEXT: cmpq %r15, %r14 ; AVX512-NEXT: sbbq %rax, %r12 -; AVX512-NEXT: sbbb $0, %r15b -; AVX512-NEXT: movzbl %r15b, %r11d -; AVX512-NEXT: andl $3, %r11d -; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload -; AVX512-NEXT: movb %r11b, 4(%r14) -; AVX512-NEXT: movzbl %bl, %r11d +; AVX512-NEXT: sbbb $0, %cl +; AVX512-NEXT: movzbl %bl, %ebx +; AVX512-NEXT: andl $3, %ebx +; AVX512-NEXT: movzbl %r11b, %r11d ; AVX512-NEXT: andl $3, %r11d +; AVX512-NEXT: leaq (%r11,%rbx,4), %r11 ; AVX512-NEXT: movzbl %r10b, %r10d ; AVX512-NEXT: andl $3, %r10d -; AVX512-NEXT: leaq (%r10,%r11,4), %r10 +; AVX512-NEXT: shll $4, %r10d +; AVX512-NEXT: orq %r11, %r10 ; AVX512-NEXT: movzbl %r8b, %r8d ; AVX512-NEXT: andl $3, %r8d -; AVX512-NEXT: shll $4, %r8d +; AVX512-NEXT: shll $6, %r8d ; AVX512-NEXT: orq %r10, %r8 ; AVX512-NEXT: movzbl %dil, %edi ; AVX512-NEXT: andl $3, %edi -; AVX512-NEXT: shll $6, %edi +; AVX512-NEXT: shll $8, %edi ; AVX512-NEXT: orq %r8, %rdi ; AVX512-NEXT: movzbl %sil, %esi ; AVX512-NEXT: andl $3, %esi -; AVX512-NEXT: shll $8, %esi -; AVX512-NEXT: orq %rdi, %rsi +; AVX512-NEXT: shll $10, %esi ; AVX512-NEXT: movzbl %dl, %edx ; AVX512-NEXT: andl $3, %edx -; AVX512-NEXT: shll $10, %edx -; AVX512-NEXT: movzbl %cl, %ecx -; AVX512-NEXT: andl $3, %ecx -; AVX512-NEXT: shll $12, %ecx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: movzbl %r9b, %edx -; AVX512-NEXT: andl $3, %edx -; AVX512-NEXT: shll $14, %edx -; AVX512-NEXT: orq %rcx, %rdx +; AVX512-NEXT: shll $12, %edx +; AVX512-NEXT: orq %rsi, %rdx +; AVX512-NEXT: movzbl %r9b, %esi +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: shll $14, %esi +; AVX512-NEXT: orq %rdx, %rsi ; AVX512-NEXT: movzbl %bpl, %eax ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: shll $16, %eax -; AVX512-NEXT: orq %rdx, %rax ; AVX512-NEXT: orq %rsi, %rax -; AVX512-NEXT: movzbl %r13b, %ecx -; AVX512-NEXT: andl $3, %ecx -; AVX512-NEXT: shll $18, %ecx +; AVX512-NEXT: orq %rdi, %rax +; AVX512-NEXT: movzbl %r13b, %edx +; AVX512-NEXT: andl $3, %edx +; AVX512-NEXT: shll $18, %edx +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: shll $20, %esi +; AVX512-NEXT: orq %rdx, %rsi ; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload ; AVX512-NEXT: andl $3, %edx -; AVX512-NEXT: shll $20, %edx -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX512-NEXT: andl $3, %ecx -; AVX512-NEXT: shll $22, %ecx -; AVX512-NEXT: orq %rdx, %rcx +; AVX512-NEXT: shll $22, %edx +; AVX512-NEXT: orq %rsi, %rdx +; 
AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: shll $24, %esi +; AVX512-NEXT: orq %rdx, %rsi ; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload ; AVX512-NEXT: andl $3, %edx -; AVX512-NEXT: shll $24, %edx -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload -; AVX512-NEXT: andl $3, %ecx -; AVX512-NEXT: shlq $26, %rcx -; AVX512-NEXT: orq %rdx, %rcx -; AVX512-NEXT: orq %rax, %rcx +; AVX512-NEXT: shlq $26, %rdx +; AVX512-NEXT: orq %rsi, %rdx +; AVX512-NEXT: orq %rax, %rdx ; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload ; AVX512-NEXT: andl $3, %eax ; AVX512-NEXT: shlq $28, %rax -; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload -; AVX512-NEXT: andl $3, %edx -; AVX512-NEXT: shlq $30, %rdx -; AVX512-NEXT: orq %rax, %rdx -; AVX512-NEXT: orq %rcx, %rdx -; AVX512-NEXT: movq %r14, %rax -; AVX512-NEXT: movl %edx, (%r14) +; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload +; AVX512-NEXT: andl $3, %esi +; AVX512-NEXT: shlq $30, %rsi +; AVX512-NEXT: orq %rax, %rsi +; AVX512-NEXT: orq %rdx, %rsi +; AVX512-NEXT: andb $3, %cl +; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload +; AVX512-NEXT: movb %cl, 4(%rax) +; AVX512-NEXT: movl %esi, (%rax) ; AVX512-NEXT: addq $88, %rsp ; AVX512-NEXT: popq %rbx ; AVX512-NEXT: popq %r12 @@ -3198,22 +3195,22 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: sbbb $0, %bl ; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl %esi, %edi ; X86-NEXT: sbbl %edx, %edi -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %edi ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NEXT: sbbl %ebx, %edi ; X86-NEXT: movl $0, %edi ; X86-NEXT: sbbl %edi, %edi ; X86-NEXT: setb %cl -; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) +; X86-NEXT: cmpl %ebp, {{[0-9]+}}(%esp) ; X86-NEXT: sbbl %esi, %edx -; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: sbbl %eax, %ebx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: sbbb $0, %cl @@ -3239,208 +3236,213 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: cmpl %eax, %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %edi -; X86-NEXT: sbbl %edx, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %ecx +; X86-NEXT: sbbl %esi, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %edi +; X86-NEXT: movl %ebp, %ecx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: sbbl %ebx, %edi -; X86-NEXT: movl $0, %edi -; X86-NEXT: sbbl %edi, %edi +; X86-NEXT: sbbl %ebx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ecx, %ecx ; X86-NEXT: setb %cl -; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) 
-; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: cmpl %edx, %eax +; X86-NEXT: sbbl %edi, %esi ; X86-NEXT: sbbl %ebp, %ebx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: sbbb $0, %cl ; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %edx -; X86-NEXT: sbbl %esi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: sbbl %edi, %edx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NEXT: movl %ebp, %edx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ecx, %edx ; X86-NEXT: movl $0, %edx ; X86-NEXT: sbbl %edx, %edx ; X86-NEXT: setb %dl -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: cmpl %esi, %eax +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %ebp, %ecx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: sbbb $0, %dl ; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %eax, %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl %ebp, %eax +; X86-NEXT: sbbl %ebx, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: setb %bl -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ecx, %eax ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: sbbb $0, %bl -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: cmpl %eax, %ecx +; X86-NEXT: setb %al +; X86-NEXT: cmpl %edi, %esi +; X86-NEXT: sbbl %ebp, %ebx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %esi +; X86-NEXT: sbbl %esi, %esi +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %ebx -; X86-NEXT: sbbl %esi, %ebx -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: cmpl %esi, %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: sbbl %ebx, %ebp ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ebx -; X86-NEXT: movl $0, %ebx -; X86-NEXT: sbbl %ebx, %ebx -; X86-NEXT: setb %bl -; X86-NEXT: cmpl %ecx, %eax -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: sbbl %ebp, %edx +; X86-NEXT: movl %edx, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ecx, %ebp +; X86-NEXT: movl $0, %ebp +; X86-NEXT: sbbl %ebp, %ebp +; X86-NEXT: 
setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill +; X86-NEXT: cmpl %edi, %esi +; X86-NEXT: sbbl %eax, %ebx +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: sbbb $0, %bl -; X86-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax +; X86-NEXT: sbbb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %ebp -; X86-NEXT: sbbl %esi, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx ; X86-NEXT: movl %ebx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ebp +; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NEXT: movl %eax, %ebp +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ecx, %ebp ; X86-NEXT: movl $0, %ebp ; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %edi, %esi -; X86-NEXT: sbbl %ebx, %edx +; X86-NEXT: setb %dl +; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %ebx, %edi +; X86-NEXT: sbbl %eax, %ecx ; X86-NEXT: movl $0, %eax ; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: sbbb $0, %cl -; X86-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: sbbb $0, %dl +; X86-NEXT: movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi +; X86-NEXT: cmpl %eax, %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: movl %ebp, %eax -; X86-NEXT: sbbl %edi, %eax -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl %ebp, %ebx +; X86-NEXT: sbbl %edi, %ebx ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edx, %eax -; X86-NEXT: movl $0, %eax -; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ecx, %ebx +; X86-NEXT: movl $0, %ebx +; X86-NEXT: sbbl %ebx, %ebx ; X86-NEXT: setb %bl -; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) +; X86-NEXT: cmpl %esi, %eax ; X86-NEXT: sbbl %ebp, %edi -; X86-NEXT: sbbl %ecx, %edx -; X86-NEXT: movl $0, %ecx -; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax ; X86-NEXT: sbbb $0, %bl -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %ebp -; X86-NEXT: sbbl %edi, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: movl %eax, %ebp +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %eax +; X86-NEXT: sbbl %ebp, %eax ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: sbbl %edx, %ebp -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: setb %bh -; X86-NEXT: cmpl %esi, {{[0-9]+}}(%esp) -; X86-NEXT: sbbl %ecx, %edi -; X86-NEXT: sbbl %eax, %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NEXT: sbbl %ecx, 
%eax +; X86-NEXT: movl $0, %eax +; X86-NEXT: sbbl %eax, %eax +; X86-NEXT: setb %al +; X86-NEXT: cmpl %edi, {{[0-9]+}}(%esp) +; X86-NEXT: sbbl %esi, %ebp +; X86-NEXT: sbbl %edx, %ecx ; X86-NEXT: movl $0, %ecx ; X86-NEXT: sbbl %ecx, %ecx -; X86-NEXT: sbbb $0, %bh +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movzbl %al, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl %bl, %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: cmpl %eax, {{[0-9]+}}(%esp) -; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %esi, %ebp -; X86-NEXT: sbbl %edi, %ebp +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NEXT: sbbl %eax, %ecx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NEXT: movl %edx, %ebp -; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %eax, %ebp -; X86-NEXT: movl $0, %ebp -; X86-NEXT: sbbl %ebp, %ebp -; X86-NEXT: setb %cl -; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp -; X86-NEXT: sbbl %esi, %edi -; X86-NEXT: sbbl %edx, %eax +; X86-NEXT: sbbl %edx, %ecx +; X86-NEXT: movl $0, %ecx +; X86-NEXT: sbbl %ecx, %ecx +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: setb %al +; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NEXT: sbbl {{[0-9]+}}(%esp), %ebx +; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: sbbl %edx, %edx +; X86-NEXT: sbbb $0, %al +; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NEXT: sbbl %eax, %eax -; X86-NEXT: sbbb $0, %cl -; X86-NEXT: movzbl %cl, %ecx -; X86-NEXT: andl $3, %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movb %cl, 4(%edi) -; X86-NEXT: movzbl %bh, %ebp -; X86-NEXT: movzbl %bl, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: andl $3, %ebp -; X86-NEXT: andl $3, %ecx -; X86-NEXT: leal (%ecx,%ebp,4), %ecx ; X86-NEXT: andl $3, %eax -; X86-NEXT: shll $4, %eax -; X86-NEXT: orl %ecx, %eax -; X86-NEXT: andl $3, %ebx -; X86-NEXT: shll $6, %ebx -; X86-NEXT: orl %eax, %ebx +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NEXT: andl $3, %edx +; X86-NEXT: leal (%edx,%eax,4), %edx +; X86-NEXT: andl $3, %edi +; X86-NEXT: shll $4, %edi +; X86-NEXT: orl %edx, %edi +; X86-NEXT: andl $3, %ebp +; X86-NEXT: shll $6, %ebp +; X86-NEXT: orl %edi, %ebp ; X86-NEXT: andl $3, %esi ; X86-NEXT: shll $8, %esi -; X86-NEXT: orl %ebx, 
%esi +; X86-NEXT: orl %ebp, %esi +; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NEXT: andl $3, %edx ; X86-NEXT: shll $10, %edx ; X86-NEXT: orl %esi, %edx +; X86-NEXT: andl $3, %ecx +; X86-NEXT: shll $12, %ecx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: andl $3, %eax -; X86-NEXT: shll $12, %eax +; X86-NEXT: shll $14, %eax +; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload ; X86-NEXT: andl $3, %ecx -; X86-NEXT: shll $14, %ecx +; X86-NEXT: shll $16, %ecx ; X86-NEXT: orl %eax, %ecx -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload -; X86-NEXT: andl $3, %eax -; X86-NEXT: shll $16, %eax -; X86-NEXT: orl %ecx, %eax ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload ; X86-NEXT: andl $3, %esi ; X86-NEXT: shll $18, %esi -; X86-NEXT: orl %eax, %esi -; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload +; X86-NEXT: orl %ecx, %esi ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload ; X86-NEXT: andl $3, %eax ; X86-NEXT: shll $20, %eax ; X86-NEXT: orl %esi, %eax @@ -3449,22 +3451,24 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind { ; X86-NEXT: orl %edx, %eax ; X86-NEXT: andl $3, %ecx ; X86-NEXT: shll $22, %ecx +; X86-NEXT: andl $3, %edi +; X86-NEXT: shll $24, %edi +; X86-NEXT: orl %ecx, %edi ; X86-NEXT: andl $3, %esi -; X86-NEXT: shll $24, %esi -; X86-NEXT: orl %ecx, %esi +; X86-NEXT: shll $26, %esi +; X86-NEXT: orl %edi, %esi ; X86-NEXT: andl $3, %ebx -; X86-NEXT: shll $26, %ebx +; X86-NEXT: shll $28, %ebx ; X86-NEXT: orl %esi, %ebx ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-NEXT: andl $3, %ecx -; X86-NEXT: shll $28, %ecx +; X86-NEXT: shll $30, %ecx ; X86-NEXT: orl %ebx, %ecx +; X86-NEXT: orl %eax, %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload -; X86-NEXT: shll $30, %edx -; X86-NEXT: orl %ecx, %edx -; X86-NEXT: orl %eax, %edx -; X86-NEXT: movl %edx, (%edi) -; X86-NEXT: movl %edi, %eax +; X86-NEXT: andb $3, %dl +; X86-NEXT: movb %dl, 4(%eax) +; X86-NEXT: movl %ecx, (%eax) ; X86-NEXT: addl $132, %esp ; X86-NEXT: popl %esi ; X86-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll index 9a8719f9a64fa..cecc3efd86282 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll @@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) { define i8 @in8_constmask(i8 %x, i8 %y) { ; CHECK-NOBMI-LABEL: in8_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %edi -; CHECK-NOBMI-NEXT: andb $60, %dil -; CHECK-NOBMI-NEXT: xorb %dil, %al +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorb %sil, %al +; CHECK-NOBMI-NEXT: andb $60, %al +; CHECK-NOBMI-NEXT: xorb %sil, %al ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: in8_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: xorl %esi, %edi -; CHECK-BMI-NEXT: andb $60, %dil -; CHECK-BMI-NEXT: xorb %dil, 
%al +; CHECK-BMI-NEXT: movl %edi, %eax +; CHECK-BMI-NEXT: xorb %sil, %al +; CHECK-BMI-NEXT: andb $60, %al +; CHECK-BMI-NEXT: xorb %sil, %al ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %n0 = xor i8 %x, %y diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll index c4c4e5ed1fdde..2e2c152c5506a 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll @@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) { define i8 @in8_constmask(i8 %x, i8 %y) { ; CHECK-NOBMI-LABEL: in8_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %edi -; CHECK-NOBMI-NEXT: andb $85, %dil -; CHECK-NOBMI-NEXT: xorb %dil, %al +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorb %sil, %al +; CHECK-NOBMI-NEXT: andb $85, %al +; CHECK-NOBMI-NEXT: xorb %sil, %al ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: in8_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: xorl %esi, %edi -; CHECK-BMI-NEXT: andb $85, %dil -; CHECK-BMI-NEXT: xorb %dil, %al +; CHECK-BMI-NEXT: movl %edi, %eax +; CHECK-BMI-NEXT: xorb %sil, %al +; CHECK-BMI-NEXT: andb $85, %al +; CHECK-BMI-NEXT: xorb %sil, %al ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %n0 = xor i8 %x, %y diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll index 2ea74f3942387..074b6f8591d1e 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll @@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) { define i8 @in8_constmask(i8 %x, i8 %y) { ; CHECK-NOBMI-LABEL: in8_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %edi -; CHECK-NOBMI-NEXT: andb $15, %dil -; CHECK-NOBMI-NEXT: xorb %dil, %al +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorb %sil, %al +; CHECK-NOBMI-NEXT: andb $15, %al +; CHECK-NOBMI-NEXT: xorb %sil, %al ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: in8_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: xorl %esi, %edi -; CHECK-BMI-NEXT: andb $15, %dil -; CHECK-BMI-NEXT: xorb %dil, %al +; CHECK-BMI-NEXT: movl %edi, %eax +; CHECK-BMI-NEXT: xorb %sil, %al +; CHECK-BMI-NEXT: andb $15, %al +; CHECK-BMI-NEXT: xorb %sil, %al ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %n0 = xor i8 %x, %y diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll index eb6accd3e623b..a9cf67789288c 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll @@ -104,19 +104,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) { define i8 @in8_constmask(i8 %x, i8 %y) { ; CHECK-NOBMI-LABEL: in8_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %esi, %eax -; CHECK-NOBMI-NEXT: 
xorl %esi, %edi -; CHECK-NOBMI-NEXT: andb $15, %dil -; CHECK-NOBMI-NEXT: xorb %dil, %al +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorb %sil, %al +; CHECK-NOBMI-NEXT: andb $15, %al +; CHECK-NOBMI-NEXT: xorb %sil, %al ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: in8_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %esi, %eax -; CHECK-BMI-NEXT: xorl %esi, %edi -; CHECK-BMI-NEXT: andb $15, %dil -; CHECK-BMI-NEXT: xorb %dil, %al +; CHECK-BMI-NEXT: movl %edi, %eax +; CHECK-BMI-NEXT: xorb %sil, %al +; CHECK-BMI-NEXT: andb $15, %al +; CHECK-BMI-NEXT: xorb %sil, %al ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %n0 = xor i8 %x, %y diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll index 9c9d06921096c..6889fe1edff4b 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll @@ -6,18 +6,17 @@ define i8 @out8(i8 %x, i8 %y, i8 %mask) { ; CHECK-NOBMI-LABEL: out8: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edx, %eax -; CHECK-NOBMI-NEXT: andl %edx, %edi -; CHECK-NOBMI-NEXT: notb %al -; CHECK-NOBMI-NEXT: andb %sil, %al -; CHECK-NOBMI-NEXT: orb %dil, %al +; CHECK-NOBMI-NEXT: movl %edi, %eax +; CHECK-NOBMI-NEXT: xorb %sil, %al +; CHECK-NOBMI-NEXT: andb %dl, %al +; CHECK-NOBMI-NEXT: xorb %sil, %al ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out8: ; CHECK-BMI: # %bb.0: ; CHECK-BMI-NEXT: movl %edx, %eax -; CHECK-BMI-NEXT: andl %edx, %edi +; CHECK-BMI-NEXT: andb %al, %dil ; CHECK-BMI-NEXT: notb %al ; CHECK-BMI-NEXT: andb %sil, %al ; CHECK-BMI-NEXT: orb %dil, %al @@ -106,17 +105,18 @@ define i8 @in8(i8 %x, i8 %y, i8 %mask) { ; CHECK-NOBMI-LABEL: in8: ; CHECK-NOBMI: # %bb.0: ; CHECK-NOBMI-NEXT: movl %edi, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %eax -; CHECK-NOBMI-NEXT: andl %edx, %eax -; CHECK-NOBMI-NEXT: xorl %esi, %eax +; CHECK-NOBMI-NEXT: xorb %sil, %al +; CHECK-NOBMI-NEXT: andb %dl, %al +; CHECK-NOBMI-NEXT: xorb %sil, %al ; CHECK-NOBMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: in8: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: andnl %esi, %edx, %eax -; CHECK-BMI-NEXT: andl %edx, %edi -; CHECK-BMI-NEXT: orl %edi, %eax +; CHECK-BMI-NEXT: movl %edi, %eax +; CHECK-BMI-NEXT: xorb %sil, %al +; CHECK-BMI-NEXT: andb %dl, %al +; CHECK-BMI-NEXT: xorb %sil, %al ; CHECK-BMI-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BMI-NEXT: retq %n0 = xor i8 %x, %y diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll index b1194bedc4e1c..27fa5e68f1305 100644 --- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll @@ -16,11 +16,10 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: out_v1i8: ; CHECK: # %bb.0: -; CHECK-NEXT: movl %edx, %eax -; CHECK-NEXT: andl %edx, %edi -; CHECK-NEXT: notb %al -; CHECK-NEXT: andb %sil, %al -; CHECK-NEXT: orb %dil, %al +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: xorb %sil, %al +; CHECK-NEXT: andb %dl, %al +; CHECK-NEXT: xorb %sil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %mx = and <1 x i8> %x, %mask @@ -37,32 +36,28 @@ define <1 x i8> 
@out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-BASELINE-LABEL: out_v2i8: ; CHECK-BASELINE: # %bb.0: -; CHECK-BASELINE-NEXT: movl %r8d, %eax -; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: andl %r8d, %edi -; CHECK-BASELINE-NEXT: notb %al -; CHECK-BASELINE-NEXT: notb %r9b -; CHECK-BASELINE-NEXT: andb %cl, %r9b -; CHECK-BASELINE-NEXT: andb %dl, %al -; CHECK-BASELINE-NEXT: orb %dil, %al -; CHECK-BASELINE-NEXT: orb %sil, %r9b +; CHECK-BASELINE-NEXT: movl %edi, %eax +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: andb %r8b, %al +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: xorb %cl, %sil +; CHECK-BASELINE-NEXT: andb %r9b, %sil +; CHECK-BASELINE-NEXT: xorb %cl, %sil ; CHECK-BASELINE-NEXT: # kill: def $al killed $al killed $eax -; CHECK-BASELINE-NEXT: movl %r9d, %edx +; CHECK-BASELINE-NEXT: movl %esi, %edx ; CHECK-BASELINE-NEXT: retq ; ; CHECK-SSE1-LABEL: out_v2i8: ; CHECK-SSE1: # %bb.0: -; CHECK-SSE1-NEXT: movl %r8d, %eax -; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: andl %r8d, %edi -; CHECK-SSE1-NEXT: notb %al -; CHECK-SSE1-NEXT: notb %r9b -; CHECK-SSE1-NEXT: andb %cl, %r9b -; CHECK-SSE1-NEXT: andb %dl, %al -; CHECK-SSE1-NEXT: orb %dil, %al -; CHECK-SSE1-NEXT: orb %sil, %r9b +; CHECK-SSE1-NEXT: movl %edi, %eax +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: andb %r8b, %al +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: xorb %cl, %sil +; CHECK-SSE1-NEXT: andb %r9b, %sil +; CHECK-SSE1-NEXT: xorb %cl, %sil ; CHECK-SSE1-NEXT: # kill: def $al killed $al killed $eax -; CHECK-SSE1-NEXT: movl %r9d, %edx +; CHECK-SSE1-NEXT: movl %esi, %edx ; CHECK-SSE1-NEXT: retq ; ; CHECK-SSE2-LABEL: out_v2i8: @@ -111,7 +106,7 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: xorb %r9b, %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-BASELINE-NEXT: xorb %r9b, %sil ; CHECK-BASELINE-NEXT: xorb %r11b, %dl @@ -135,7 +130,7 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: xorb %r9b, %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-SSE1-NEXT: xorb %r9b, %sil ; CHECK-SSE1-NEXT: xorb %r11b, %dl @@ -178,7 +173,7 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: xorb %r9b, %sil ; CHECK-BASELINE-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-BASELINE-NEXT: xorb %r9b, %sil ; CHECK-BASELINE-NEXT: xorb %r10b, %dl @@ -199,7 +194,7 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %cl -; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: xorb %r9b, %sil ; CHECK-SSE1-NEXT: andb {{[0-9]+}}(%rsp), %sil ; CHECK-SSE1-NEXT: xorb %r9b, %sil ; 
CHECK-SSE1-NEXT: xorb %r10b, %dl @@ -2277,9 +2272,9 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind { ; CHECK-LABEL: in_v1i8: ; CHECK: # %bb.0: ; CHECK-NEXT: movl %edi, %eax -; CHECK-NEXT: xorl %esi, %eax -; CHECK-NEXT: andl %edx, %eax -; CHECK-NEXT: xorl %esi, %eax +; CHECK-NEXT: xorb %sil, %al +; CHECK-NEXT: andb %dl, %al +; CHECK-NEXT: xorb %sil, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retq %n0 = xor <1 x i8> %x, %y @@ -2296,12 +2291,12 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-BASELINE-LABEL: in_v2i8: ; CHECK-BASELINE: # %bb.0: ; CHECK-BASELINE-NEXT: movl %edi, %eax -; CHECK-BASELINE-NEXT: xorl %edx, %eax -; CHECK-BASELINE-NEXT: xorl %ecx, %esi -; CHECK-BASELINE-NEXT: andl %r9d, %esi -; CHECK-BASELINE-NEXT: andl %r8d, %eax -; CHECK-BASELINE-NEXT: xorl %edx, %eax -; CHECK-BASELINE-NEXT: xorl %ecx, %esi +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: xorb %cl, %sil +; CHECK-BASELINE-NEXT: andb %r9b, %sil +; CHECK-BASELINE-NEXT: andb %r8b, %al +; CHECK-BASELINE-NEXT: xorb %dl, %al +; CHECK-BASELINE-NEXT: xorb %cl, %sil ; CHECK-BASELINE-NEXT: # kill: def $al killed $al killed $eax ; CHECK-BASELINE-NEXT: movl %esi, %edx ; CHECK-BASELINE-NEXT: retq @@ -2309,12 +2304,12 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind { ; CHECK-SSE1-LABEL: in_v2i8: ; CHECK-SSE1: # %bb.0: ; CHECK-SSE1-NEXT: movl %edi, %eax -; CHECK-SSE1-NEXT: xorl %edx, %eax -; CHECK-SSE1-NEXT: xorl %ecx, %esi -; CHECK-SSE1-NEXT: andl %r9d, %esi -; CHECK-SSE1-NEXT: andl %r8d, %eax -; CHECK-SSE1-NEXT: xorl %edx, %eax -; CHECK-SSE1-NEXT: xorl %ecx, %esi +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: xorb %cl, %sil +; CHECK-SSE1-NEXT: andb %r9b, %sil +; CHECK-SSE1-NEXT: andb %r8b, %al +; CHECK-SSE1-NEXT: xorb %dl, %al +; CHECK-SSE1-NEXT: xorb %cl, %sil ; CHECK-SSE1-NEXT: # kill: def $al killed $al killed $eax ; CHECK-SSE1-NEXT: movl %esi, %edx ; CHECK-SSE1-NEXT: retq @@ -2362,7 +2357,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-BASELINE-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-BASELINE-NEXT: xorl %r9d, %esi +; CHECK-BASELINE-NEXT: xorb %r9b, %sil ; CHECK-BASELINE-NEXT: xorb %r11b, %dl ; CHECK-BASELINE-NEXT: xorb %r10b, %cl ; CHECK-BASELINE-NEXT: xorb %dil, %r8b @@ -2386,7 +2381,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind { ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %edi ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r10d ; CHECK-SSE1-NEXT: movzbl {{[0-9]+}}(%rsp), %r11d -; CHECK-SSE1-NEXT: xorl %r9d, %esi +; CHECK-SSE1-NEXT: xorb %r9b, %sil ; CHECK-SSE1-NEXT: xorb %r11b, %dl ; CHECK-SSE1-NEXT: xorb %r10b, %cl ; CHECK-SSE1-NEXT: xorb %dil, %r8b diff --git a/llvm/test/CodeGen/X86/urem-i8-constant.ll b/llvm/test/CodeGen/X86/urem-i8-constant.ll index ae218405c0ef0..a2a0367cc1b0f 100644 --- a/llvm/test/CodeGen/X86/urem-i8-constant.ll +++ b/llvm/test/CodeGen/X86/urem-i8-constant.ll @@ -10,8 +10,9 @@ define i8 @foo(i8 %tmp325) { ; CHECK-NEXT: imull $111, %eax, %ecx ; CHECK-NEXT: shrl $12, %ecx ; CHECK-NEXT: leal (%ecx,%ecx,8), %edx -; CHECK-NEXT: leal (%ecx,%edx,4), %ecx -; CHECK-NEXT: subb %cl, %al +; CHECK-NEXT: shll $2, %edx +; CHECK-NEXT: addb %cl, %dl +; CHECK-NEXT: subb %dl, %al ; CHECK-NEXT: # kill: def $al killed $al killed $eax ; CHECK-NEXT: retl %t546 = urem i8 %tmp325, 37 diff 
--git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll index 97cc1f8a15694..4f3114ce43127 100644 --- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll +++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll @@ -64,9 +64,10 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; X86: # %bb.0: ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: leal (%eax,%eax,2), %ecx -; X86-NEXT: leal (%eax,%ecx,4), %eax -; X86-NEXT: andb $15, %al -; X86-NEXT: cmpb $4, %al +; X86-NEXT: shll $2, %ecx +; X86-NEXT: addb %al, %cl +; X86-NEXT: andb $15, %cl +; X86-NEXT: cmpb $4, %cl ; X86-NEXT: setae %al ; X86-NEXT: retl ; @@ -74,9 +75,10 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind { ; X64: # %bb.0: ; X64-NEXT: # kill: def $edi killed $edi def $rdi ; X64-NEXT: leal (%rdi,%rdi,2), %eax -; X64-NEXT: leal (%rdi,%rax,4), %eax -; X64-NEXT: andb $15, %al -; X64-NEXT: cmpb $4, %al +; X64-NEXT: shll $2, %eax +; X64-NEXT: addb %al, %dil +; X64-NEXT: andb $15, %dil +; X64-NEXT: cmpb $4, %dil ; X64-NEXT: setae %al ; X64-NEXT: retq %urem = urem i4 %X, 5 diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll index bf027a7346deb..eb8c6af81ba41 100644 --- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll @@ -1355,10 +1355,8 @@ define i1 @bool_reduction_v8i32(<8 x i32> %x, <8 x i32> %y) { ; SSE2-NEXT: pxor %xmm4, %xmm0 ; SSE2-NEXT: pcmpgtd %xmm2, %xmm0 ; SSE2-NEXT: packssdw %xmm1, %xmm0 -; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: notl %eax -; SSE2-NEXT: cmpb $-1, %al +; SSE2-NEXT: testl %eax, %eax ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll index 2df39d69dbb75..8f441fee67084 100644 --- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll +++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll @@ -1195,7 +1195,7 @@ define i1 @bool_reduction_v8i32(<8 x i32> %x, <8 x i32> %y) { ; SSE2-NEXT: packssdw %xmm1, %xmm0 ; SSE2-NEXT: packsswb %xmm0, %xmm0 ; SSE2-NEXT: pmovmskb %xmm0, %eax -; SSE2-NEXT: xorb $-1, %al +; SSE2-NEXT: cmpb $-1, %al ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll index 9cd0f4d12e15a..4d0d0548d44f9 100644 --- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll @@ -1068,10 +1068,10 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; SSE2-NEXT: movd %xmm1, %eax ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] ; SSE2-NEXT: movd %xmm0, %ecx -; SSE2-NEXT: orl %eax, %ecx +; SSE2-NEXT: orb %al, %cl ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3] ; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: orl %ecx, %eax +; SSE2-NEXT: orb %cl, %al ; SSE2-NEXT: testb $1, %al ; SSE2-NEXT: je .LBB27_2 ; SSE2-NEXT: # %bb.1: @@ -1088,9 +1088,9 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; SSE41-NEXT: pxor %xmm0, %xmm1 ; SSE41-NEXT: pextrd $1, %xmm1, %eax ; SSE41-NEXT: movd %xmm1, %ecx -; SSE41-NEXT: orl %eax, %ecx +; SSE41-NEXT: orb %al, %cl ; SSE41-NEXT: pextrd $2, %xmm1, %eax -; SSE41-NEXT: orl %ecx, %eax +; SSE41-NEXT: orb %cl, %al ; SSE41-NEXT: testb $1, %al ; SSE41-NEXT: je .LBB27_2 ; SSE41-NEXT: # %bb.1: @@ -1107,9 +1107,9 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) { ; AVX1OR2-NEXT: vpxor %xmm1, %xmm0, %xmm0 ; AVX1OR2-NEXT: vpextrd $1, %xmm0, %eax ; 
AVX1OR2-NEXT: vmovd %xmm0, %ecx -; AVX1OR2-NEXT: orl %eax, %ecx +; AVX1OR2-NEXT: orb %al, %cl ; AVX1OR2-NEXT: vpextrd $2, %xmm0, %eax -; AVX1OR2-NEXT: orl %ecx, %eax +; AVX1OR2-NEXT: orb %cl, %al ; AVX1OR2-NEXT: testb $1, %al ; AVX1OR2-NEXT: je .LBB27_2 ; AVX1OR2-NEXT: # %bb.1: diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll index 3c98eba69ae5b..1b36fa7d64ca7 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll @@ -12065,7 +12065,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: leal (,%rdi,8), %eax -; FALLBACK0-NEXT: andl $56, %eax ; FALLBACK0-NEXT: andl $56, %edi ; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 @@ -12073,6 +12072,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: andb $56, %sil ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx ; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 @@ -12167,7 +12167,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK1-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: leal (,%rax,8), %ecx -; FALLBACK1-NEXT: andl $56, %ecx ; FALLBACK1-NEXT: andl $56, %eax ; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi ; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi @@ -12234,54 +12233,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx -; FALLBACK2-NEXT: andl $56, %ecx ; FALLBACK2-NEXT: andl $56, %eax ; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 ; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi ; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbx ; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 ; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp +; FALLBACK2-NEXT: shrxq %rcx, %r11, %rbp ; FALLBACK2-NEXT: movl %ecx, %r12d +; FALLBACK2-NEXT: andb $56, %r12b ; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; FALLBACK2-NEXT: shlxq %r12, %r9, %r9 -; FALLBACK2-NEXT: orq %rbx, %r9 +; FALLBACK2-NEXT: addq %r10, %r10 +; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi ; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK2-NEXT: shrxq %rcx, %r9, %r13 ; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK2-NEXT: shrxq %rcx, %rax, %rcx -; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: 
orq %r8, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %r12, %r11, %r11 +; FALLBACK2-NEXT: orq %r8, %r11 ; FALLBACK2-NEXT: addq %rsi, %rsi ; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 +; FALLBACK2-NEXT: orq %rbx, %rsi +; FALLBACK2-NEXT: leaq (%r9,%r9), %r8 ; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 ; FALLBACK2-NEXT: orq %r15, %r8 ; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 +; FALLBACK2-NEXT: shlxq %r12, %r14, %r9 +; FALLBACK2-NEXT: orq %rbp, %r9 ; FALLBACK2-NEXT: addq %rax, %rax ; FALLBACK2-NEXT: shlxq %r12, %rax, %rax ; FALLBACK2-NEXT: orq %r13, %rax ; FALLBACK2-NEXT: movq %rcx, 56(%rdx) ; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r9, 32(%rdx) ; FALLBACK2-NEXT: movq %r8, 40(%rdx) ; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) +; FALLBACK2-NEXT: movq %r11, 24(%rdx) ; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 @@ -12319,7 +12318,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: leal (,%rax,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx ; FALLBACK3-NEXT: andl $56, %eax ; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi ; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi @@ -12338,17 +12336,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 ; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax -; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi +; FALLBACK3-NEXT: shrxq %rcx, %rax, %rax ; FALLBACK3-NEXT: movq %r11, 48(%rdx) +; FALLBACK3-NEXT: movq %rax, 56(%rdx) ; FALLBACK3-NEXT: movq %r10, 32(%rdx) ; FALLBACK3-NEXT: movq %r15, 40(%rdx) ; FALLBACK3-NEXT: movq %rdi, 16(%rdx) ; FALLBACK3-NEXT: movq %rbx, 24(%rdx) ; FALLBACK3-NEXT: movq %rsi, (%rdx) ; FALLBACK3-NEXT: movq %r8, 8(%rdx) -; FALLBACK3-NEXT: movq %rax, 56(%rdx) ; FALLBACK3-NEXT: popq %rbx ; FALLBACK3-NEXT: popq %r14 ; FALLBACK3-NEXT: popq %r15 @@ -12378,13 +12375,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: leal (,%r8,8), %eax -; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %r8d ; FALLBACK4-NEXT: movq -128(%rsp,%r8), %r10 ; FALLBACK4-NEXT: movq -120(%rsp,%r8), %r9 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: andb $56, %sil ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r9,%r9), %rdi ; FALLBACK4-NEXT: movl %esi, %ecx @@ -12474,7 +12471,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx ; FALLBACK5-NEXT: andl $56, %eax ; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 @@ -12534,9 +12530,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, 
ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %rbx ; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 @@ -12545,34 +12540,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl +; FALLBACK6-NEXT: movl %esi, %r11d +; FALLBACK6-NEXT: andb $56, %r11b +; FALLBACK6-NEXT: notb %r11b ; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shlxq %r11, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: leaq (%r13,%r13), %rbx +; FALLBACK6-NEXT: shlxq %r11, %rbx, %rbx +; FALLBACK6-NEXT: orq %r12, %rbx ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK6-NEXT: shrxq %rsi, %rax, %rsi ; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK6-NEXT: shlxq %r11, %rdi, %rdi ; FALLBACK6-NEXT: orq %r9, %rdi ; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK6-NEXT: shlxq %r11, %r9, %r9 ; FALLBACK6-NEXT: orq %r14, %r9 ; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK6-NEXT: shlxq %r11, %r10, %r10 ; FALLBACK6-NEXT: orq %r15, %r10 ; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK6-NEXT: shlxq %r11, %rax, %rax ; FALLBACK6-NEXT: orq %r13, %rax ; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK6-NEXT: shlxq %r11, %rcx, %rcx ; FALLBACK6-NEXT: orq %rbp, %rcx ; FALLBACK6-NEXT: movq %rsi, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) @@ -12580,7 +12576,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %r10, 32(%rdx) ; FALLBACK6-NEXT: movq %r9, 40(%rdx) ; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %rbx, 24(%rdx) ; FALLBACK6-NEXT: movq %r8, (%rdx) ; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx @@ -12611,7 +12607,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: andl $56, %ecx ; FALLBACK7-NEXT: andl $56, %eax ; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 @@ -12631,17 +12626,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK7-NEXT: movq %rax, %r15 ; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK7-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK7-NEXT: shrxq %rcx, %r11, 
%rax ; FALLBACK7-NEXT: movq %r15, 8(%rdx) ; FALLBACK7-NEXT: movq %r9, 48(%rdx) +; FALLBACK7-NEXT: movq %rax, 56(%rdx) ; FALLBACK7-NEXT: movq %rdi, 32(%rdx) ; FALLBACK7-NEXT: movq %rbx, 40(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rsi, 24(%rdx) ; FALLBACK7-NEXT: movq %r14, (%rdx) -; FALLBACK7-NEXT: movq %r10, 56(%rdx) ; FALLBACK7-NEXT: popq %rbx ; FALLBACK7-NEXT: popq %r14 ; FALLBACK7-NEXT: popq %r15 @@ -12665,13 +12659,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: leal (,%r9,8), %eax -; FALLBACK8-NEXT: andl $56, %eax ; FALLBACK8-NEXT: andl $56, %r9d ; FALLBACK8-NEXT: movq -128(%rsp,%r9), %r10 ; FALLBACK8-NEXT: movq -120(%rsp,%r9), %r8 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: andb $56, %sil ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK8-NEXT: movl %esi, %ecx @@ -12749,43 +12743,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: pushq %rbx ; FALLBACK9-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK9-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK9-NEXT: movl (%rsi), %eax +; FALLBACK9-NEXT: movl (%rsi), %edi ; FALLBACK9-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: andl $56, %ecx -; FALLBACK9-NEXT: andl $56, %eax -; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq %r9, %rsi -; FALLBACK9-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK9-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK9-NEXT: leal (,%rdi,8), %ecx +; FALLBACK9-NEXT: andl $56, %edi +; FALLBACK9-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK9-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq %r9, %rax +; FALLBACK9-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK9-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK9-NEXT: movq %r10, %r8 ; FALLBACK9-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK9-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK9-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK9-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK9-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: movq %r11, %rbx ; FALLBACK9-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK9-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK9-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK9-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK9-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK9-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK9-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK9-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK9-NEXT: movq %rax, %r15 +; FALLBACK9-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK9-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK9-NEXT: movq %rdi, %r15 ; FALLBACK9-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK9-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK9-NEXT: shrdq %cl, %rdi, %r14 ; FALLBACK9-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK9-NEXT: shrq %cl, %r11 ; FALLBACK9-NEXT: movq %r15, 8(%rdx) ; FALLBACK9-NEXT: movq %r9, 48(%rdx) ; FALLBACK9-NEXT: movq %r11, 56(%rdx) -; FALLBACK9-NEXT: movq %rdi, 32(%rdx) +; FALLBACK9-NEXT: movq %rsi, 32(%rdx) ; FALLBACK9-NEXT: movq %rbx, 40(%rdx) ; FALLBACK9-NEXT: movq %r8, 16(%rdx) -; FALLBACK9-NEXT: movq %rsi, 24(%rdx) +; FALLBACK9-NEXT: movq %rax, 24(%rdx) ; 
FALLBACK9-NEXT: movq %r14, (%rdx) ; FALLBACK9-NEXT: popq %rbx ; FALLBACK9-NEXT: popq %r14 @@ -12811,9 +12804,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %rbx ; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 @@ -12822,34 +12814,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl +; FALLBACK10-NEXT: movl %esi, %r11d +; FALLBACK10-NEXT: andb $56, %r11b +; FALLBACK10-NEXT: notb %r11b ; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 +; FALLBACK10-NEXT: shlxq %r11, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: leaq (%r13,%r13), %rbx +; FALLBACK10-NEXT: shlxq %r11, %rbx, %rbx +; FALLBACK10-NEXT: orq %r12, %rbx ; FALLBACK10-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK10-NEXT: shrxq %rsi, %rax, %rsi ; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK10-NEXT: shlxq %r11, %rdi, %rdi ; FALLBACK10-NEXT: orq %r9, %rdi ; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK10-NEXT: shlxq %r11, %r9, %r9 ; FALLBACK10-NEXT: orq %r14, %r9 ; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK10-NEXT: shlxq %r11, %r10, %r10 ; FALLBACK10-NEXT: orq %r15, %r10 ; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK10-NEXT: shlxq %r11, %rax, %rax ; FALLBACK10-NEXT: orq %r13, %rax ; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK10-NEXT: shlxq %r11, %rcx, %rcx ; FALLBACK10-NEXT: orq %rbp, %rcx ; FALLBACK10-NEXT: movq %rsi, 56(%rdx) ; FALLBACK10-NEXT: movq %rcx, 8(%rdx) @@ -12857,7 +12850,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %r10, 32(%rdx) ; FALLBACK10-NEXT: movq %r9, 40(%rdx) ; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %rbx, 24(%rdx) ; FALLBACK10-NEXT: movq %r8, (%rdx) ; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx @@ -12876,44 +12869,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: pushq %rbx ; FALLBACK11-NEXT: vmovups (%rdi), %ymm0 ; FALLBACK11-NEXT: vmovups 32(%rdi), %ymm1 -; FALLBACK11-NEXT: movl (%rsi), %eax +; FALLBACK11-NEXT: movl (%rsi), %edi ; FALLBACK11-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) -; 
FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: andl $56, %ecx -; FALLBACK11-NEXT: andl $56, %eax -; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq %r9, %rsi -; FALLBACK11-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK11-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK11-NEXT: leal (,%rdi,8), %ecx +; FALLBACK11-NEXT: andl $56, %edi +; FALLBACK11-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK11-NEXT: movq -104(%rsp,%rdi), %r9 +; FALLBACK11-NEXT: movq %r9, %rax +; FALLBACK11-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK11-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK11-NEXT: movq %r10, %r8 ; FALLBACK11-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK11-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK11-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK11-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK11-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK11-NEXT: movq %r11, %rbx ; FALLBACK11-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK11-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK11-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK11-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK11-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK11-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK11-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK11-NEXT: movq %rax, %r15 +; FALLBACK11-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK11-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK11-NEXT: movq %rdi, %r15 ; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK11-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK11-NEXT: shrdq %cl, %rdi, %r14 +; FALLBACK11-NEXT: shrxq %rcx, %r11, %rcx ; FALLBACK11-NEXT: movq %r15, 8(%rdx) ; FALLBACK11-NEXT: movq %r9, 48(%rdx) -; FALLBACK11-NEXT: movq %rdi, 32(%rdx) +; FALLBACK11-NEXT: movq %rcx, 56(%rdx) +; FALLBACK11-NEXT: movq %rsi, 32(%rdx) ; FALLBACK11-NEXT: movq %rbx, 40(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) -; FALLBACK11-NEXT: movq %rsi, 24(%rdx) +; FALLBACK11-NEXT: movq %rax, 24(%rdx) ; FALLBACK11-NEXT: movq %r14, (%rdx) -; FALLBACK11-NEXT: movq %r10, 56(%rdx) ; FALLBACK11-NEXT: popq %rbx ; FALLBACK11-NEXT: popq %r14 ; FALLBACK11-NEXT: popq %r15 @@ -12935,13 +12926,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: leal (,%r9,8), %eax -; FALLBACK12-NEXT: andl $56, %eax ; FALLBACK12-NEXT: andl $56, %r9d ; FALLBACK12-NEXT: movq -128(%rsp,%r9), %r10 ; FALLBACK12-NEXT: movq -120(%rsp,%r9), %r8 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: andb $56, %sil ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r8,%r8), %rdi ; FALLBACK12-NEXT: movl %esi, %ecx @@ -13023,7 +13014,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: leal (,%rdi,8), %ecx -; FALLBACK13-NEXT: andl $56, %ecx ; FALLBACK13-NEXT: andl $56, %edi ; FALLBACK13-NEXT: movq -96(%rsp,%rdi), %rsi ; FALLBACK13-NEXT: movq -104(%rsp,%rdi), %r9 @@ -13075,9 +13065,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: leal (,%rsi,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx ; 
FALLBACK14-NEXT: andl $56, %esi -; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %r11 +; FALLBACK14-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rbx ; FALLBACK14-NEXT: movq -112(%rsp,%rsi), %rax ; FALLBACK14-NEXT: movq -104(%rsp,%rsi), %rdi ; FALLBACK14-NEXT: shrxq %rcx, %rdi, %r12 @@ -13086,34 +13075,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq -88(%rsp,%rsi), %r10 ; FALLBACK14-NEXT: shrxq %rcx, %r10, %r14 ; FALLBACK14-NEXT: shrxq %rcx, %r13, %r15 -; FALLBACK14-NEXT: movl %ecx, %ebx -; FALLBACK14-NEXT: notb %bl +; FALLBACK14-NEXT: movl %ecx, %r11d +; FALLBACK14-NEXT: andb $56, %r11b +; FALLBACK14-NEXT: notb %r11b ; FALLBACK14-NEXT: movq -120(%rsp,%rsi), %rbp ; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shlxq %r11, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: leaq (%r13,%r13), %rbx +; FALLBACK14-NEXT: shlxq %r11, %rbx, %rbx +; FALLBACK14-NEXT: orq %r12, %rbx ; FALLBACK14-NEXT: movq -80(%rsp,%rsi), %r12 ; FALLBACK14-NEXT: shrxq %rcx, %r12, %r13 ; FALLBACK14-NEXT: shrxq %rcx, %rbp, %rbp ; FALLBACK14-NEXT: movq -72(%rsp,%rsi), %rsi ; FALLBACK14-NEXT: shrxq %rcx, %rsi, %rcx ; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK14-NEXT: shlxq %r11, %rdi, %rdi ; FALLBACK14-NEXT: orq %r9, %rdi ; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK14-NEXT: shlxq %r11, %r9, %r9 ; FALLBACK14-NEXT: orq %r14, %r9 ; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK14-NEXT: shlxq %r11, %r10, %r10 ; FALLBACK14-NEXT: orq %r15, %r10 ; FALLBACK14-NEXT: addq %rsi, %rsi -; FALLBACK14-NEXT: shlxq %rbx, %rsi, %rsi +; FALLBACK14-NEXT: shlxq %r11, %rsi, %rsi ; FALLBACK14-NEXT: orq %r13, %rsi ; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK14-NEXT: shlxq %r11, %rax, %rax ; FALLBACK14-NEXT: orq %rbp, %rax ; FALLBACK14-NEXT: movq %rcx, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 8(%rdx) @@ -13121,7 +13111,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %r10, 32(%rdx) ; FALLBACK14-NEXT: movq %r9, 40(%rdx) ; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %rbx, 24(%rdx) ; FALLBACK14-NEXT: movq %r8, (%rdx) ; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx @@ -13139,42 +13129,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: pushq %r14 ; FALLBACK15-NEXT: pushq %rbx ; FALLBACK15-NEXT: vmovups (%rdi), %zmm0 -; FALLBACK15-NEXT: movl (%rsi), %eax +; FALLBACK15-NEXT: movl (%rsi), %edi ; FALLBACK15-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) -; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: andl $56, %ecx -; FALLBACK15-NEXT: andl $56, %eax -; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi -; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq %r9, %rsi -; FALLBACK15-NEXT: shrdq %cl, %rdi, %rsi -; FALLBACK15-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK15-NEXT: leal (,%rdi,8), %ecx +; FALLBACK15-NEXT: andl $56, %edi +; FALLBACK15-NEXT: movq -96(%rsp,%rdi), %rsi +; FALLBACK15-NEXT: movq -104(%rsp,%rdi), %r9 +; 
FALLBACK15-NEXT: movq %r9, %rax +; FALLBACK15-NEXT: shrdq %cl, %rsi, %rax +; FALLBACK15-NEXT: movq -112(%rsp,%rdi), %r10 ; FALLBACK15-NEXT: movq %r10, %r8 ; FALLBACK15-NEXT: shrdq %cl, %r9, %r8 -; FALLBACK15-NEXT: movq -80(%rsp,%rax), %r9 -; FALLBACK15-NEXT: movq -88(%rsp,%rax), %r11 +; FALLBACK15-NEXT: movq -80(%rsp,%rdi), %r9 +; FALLBACK15-NEXT: movq -88(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: movq %r11, %rbx ; FALLBACK15-NEXT: shrdq %cl, %r9, %rbx -; FALLBACK15-NEXT: shrdq %cl, %r11, %rdi -; FALLBACK15-NEXT: movq -72(%rsp,%rax), %r11 +; FALLBACK15-NEXT: shrdq %cl, %r11, %rsi +; FALLBACK15-NEXT: movq -72(%rsp,%rdi), %r11 ; FALLBACK15-NEXT: shrdq %cl, %r11, %r9 -; FALLBACK15-NEXT: movq -128(%rsp,%rax), %r14 -; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax -; FALLBACK15-NEXT: movq %rax, %r15 +; FALLBACK15-NEXT: movq -128(%rsp,%rdi), %r14 +; FALLBACK15-NEXT: movq -120(%rsp,%rdi), %rdi +; FALLBACK15-NEXT: movq %rdi, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK15-NEXT: shrxq %rcx, %r11, %r10 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx -; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: shrdq %cl, %rdi, %r14 +; FALLBACK15-NEXT: shrxq %rcx, %r11, %rcx ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) -; FALLBACK15-NEXT: movq %rdi, 32(%rdx) +; FALLBACK15-NEXT: movq %rcx, 56(%rdx) +; FALLBACK15-NEXT: movq %rsi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) -; FALLBACK15-NEXT: movq %rsi, 24(%rdx) +; FALLBACK15-NEXT: movq %rax, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) -; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx ; FALLBACK15-NEXT: popq %r14 ; FALLBACK15-NEXT: popq %r15 @@ -13209,198 +13197,202 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 36(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%eax), %ebp -; FALLBACK16-NEXT: movl 44(%eax), %ebx -; FALLBACK16-NEXT: movl 48(%eax), %edi -; FALLBACK16-NEXT: movl 52(%eax), %esi -; FALLBACK16-NEXT: movl 56(%eax), %edx -; FALLBACK16-NEXT: movl 60(%eax), %ecx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %eax +; FALLBACK16-NEXT: movl 40(%eax), %ebx +; FALLBACK16-NEXT: movl 44(%eax), %edi +; FALLBACK16-NEXT: movl 48(%eax), %esi +; FALLBACK16-NEXT: movl 52(%eax), %edx +; FALLBACK16-NEXT: movl 56(%eax), %ecx +; FALLBACK16-NEXT: movl 60(%eax), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK16-NEXT: movl (%ebp), %ebp ; FALLBACK16-NEXT: xorps %xmm0, %xmm0 ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %eax, %esi -; FALLBACK16-NEXT: andl $60, %esi -; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK16-NEXT: shll $3, %eax -; FALLBACK16-NEXT: andl $24, %eax -; FALLBACK16-NEXT: movl %edx, %edi -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebp, %edx +; FALLBACK16-NEXT: movl %ebp, %ecx +; FALLBACK16-NEXT: andl $60, %ecx +; FALLBACK16-NEXT: movl 68(%esp,%ecx), %edi +; FALLBACK16-NEXT: movl %ecx, %ebx +; FALLBACK16-NEXT: shll $3, %edx +; FALLBACK16-NEXT: movl %edi, %eax +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 72(%esp,%ebx), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK16-NEXT: movb %al, %ch +; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp +; FALLBACK16-NEXT: movb %dl, %ch +; FALLBACK16-NEXT: andb $24, %ch ; FALLBACK16-NEXT: notb %ch ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: 
movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %edi, %edi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %edi, %edx +; FALLBACK16-NEXT: shll %cl, %edi +; FALLBACK16-NEXT: orl %eax, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 76(%esp,%ebx), %ebp +; FALLBACK16-NEXT: movl %ebp, %eax +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %esi +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: orl %eax, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK16-NEXT: movl %edx, %ebp -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %ebx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %ebp, %ebx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movl 84(%esp,%ebx), %ebp +; FALLBACK16-NEXT: movl %ebp, %eax +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK16-NEXT: leal (%esi,%esi), %edx ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx +; FALLBACK16-NEXT: orl %eax, %edx ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: addl %eax, %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %edi, %ebp +; 
FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl 92(%esp,%eax), %ebp +; FALLBACK16-NEXT: movl %ebp, %edx +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi ; FALLBACK16-NEXT: leal (%edi,%edi), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax +; FALLBACK16-NEXT: orl %edx, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movl %ebx, %eax +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %eax, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK16-NEXT: leal (%edx,%edx), %eax +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %esi, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: movl 100(%esp,%ebx), %ebp +; FALLBACK16-NEXT: movl %ebp, %edx +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl 104(%esp,%ebx), %esi +; FALLBACK16-NEXT: leal (%esi,%esi), %eax ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %ebp, %eax +; FALLBACK16-NEXT: orl %edx, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 108(%esp,%ebx), %edi +; FALLBACK16-NEXT: movl %edi, %eax +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 112(%esp,%ebx), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %ebp, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %edx +; 
FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: addl %edi, %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: orl %edx, %edi -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK16-NEXT: movl %esi, %ebx -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shrl %cl, %ebx -; FALLBACK16-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK16-NEXT: leal (%eax,%eax), %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: orl %esi, %edi +; FALLBACK16-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK16-NEXT: movl %esi, %eax ; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl 120(%esp,%ebx), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp +; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %edx, %eax +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %ebx, %esi -; FALLBACK16-NEXT: movb %dl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK16-NEXT: orl %edx, %esi +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl %ebx, %edx +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: movl 124(%esp,%ebx), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %eax, %edx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %edx, %ebp +; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl %ebx, 60(%eax) -; FALLBACK16-NEXT: movl %edx, 56(%eax) +; FALLBACK16-NEXT: movl %ebp, 56(%eax) ; FALLBACK16-NEXT: movl %esi, 48(%eax) -; FALLBACK16-NEXT: movl %ebp, 52(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 52(%eax) ; FALLBACK16-NEXT: movl %edi, 40(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 44(%eax) @@ -13498,91 +13490,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebp -; FALLBACK17-NEXT: andl $60, %ebp -; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK17-NEXT: 
movl 52(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shll $3, %ecx -; FALLBACK17-NEXT: andl $24, %ecx -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %edi, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl %ecx, %eax +; FALLBACK17-NEXT: andl $60, %eax +; FALLBACK17-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK17-NEXT: movl 52(%esp,%eax), %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shll $3, %ecx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi +; FALLBACK17-NEXT: movl 64(%esp,%eax), %ebx +; FALLBACK17-NEXT: movl 60(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %edi +; FALLBACK17-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %edi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 72(%esp,%eax), %edi +; FALLBACK17-NEXT: movl 68(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %esi +; FALLBACK17-NEXT: shrdl %cl, %edi, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 80(%esp,%eax), %ebx +; FALLBACK17-NEXT: movl 76(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %esi +; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl %esi, %edx -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi ; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; 
FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 88(%esp,%eax), %edi +; FALLBACK17-NEXT: movl 84(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %esi +; FALLBACK17-NEXT: shrdl %cl, %edi, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl %edi, %esi ; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK17-NEXT: movl %ebx, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK17-NEXT: movl 92(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %ebx +; FALLBACK17-NEXT: shrdl %cl, %edi, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 104(%esp,%eax), %esi +; FALLBACK17-NEXT: movl 100(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %ebx +; FALLBACK17-NEXT: shrdl %cl, %esi, %ebx +; FALLBACK17-NEXT: shrdl %cl, %edx, %edi +; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebp +; FALLBACK17-NEXT: movl 108(%esp,%eax), %edx +; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: movl %esi, 56(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %esi, %ebp ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: shrl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 60(%ebp) -; FALLBACK17-NEXT: movl %esi, 48(%ebp) -; FALLBACK17-NEXT: movl %edi, 52(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 44(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) +; FALLBACK17-NEXT: shrl %cl, %edx +; FALLBACK17-NEXT: movl %edx, 60(%eax) +; FALLBACK17-NEXT: movl %edi, 48(%eax) +; FALLBACK17-NEXT: movl %ebx, 52(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 40(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 44(%eax) +; FALLBACK17-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 32(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 36(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 24(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 28(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 16(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 20(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 8(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 12(%eax) +; FALLBACK17-NEXT: movl %ebp, (%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 4(%eax) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi @@ -13659,13 +13650,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: leal (,%eax,8), %edx -; FALLBACK18-NEXT: andl $24, %edx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: shrxl %edx, %esi, %edi ; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: andb $24, %bl ; FALLBACK18-NEXT: notb %bl ; FALLBACK18-NEXT: leal (%eax,%eax), %ebp ; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax @@ -13866,7 +13857,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: shll $3, %ecx -; FALLBACK19-NEXT: andl $24, %ecx ; FALLBACK19-NEXT: shrdl %cl, %edx, %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi @@ -13963,7 +13953,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK20-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK20-NEXT: movl (%eax), %eax +; FALLBACK20-NEXT: movl (%eax), %ecx ; FALLBACK20-NEXT: xorps %xmm4, %xmm4 ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -13973,106 +13963,110 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %esi +; FALLBACK20-NEXT: movl %ecx, %esi ; FALLBACK20-NEXT: andl $60, %esi -; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK20-NEXT: shll $3, %eax -; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %edi -; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl 68(%esp,%esi), %ebx +; FALLBACK20-NEXT: shll $3, %ecx +; FALLBACK20-NEXT: movl %ebx, %edi ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl 72(%esp,%esi), %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%eax,%eax), %ebp +; FALLBACK20-NEXT: movl %ecx, %edx +; FALLBACK20-NEXT: movl %ecx, %eax ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movb %al, %ch -; FALLBACK20-NEXT: notb %ch -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, 
%ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: andb $24, %dl +; FALLBACK20-NEXT: notb %dl +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %edx, %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %edi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %edx, %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: addl %eax, %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %edi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: movl 76(%esp,%esi), %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl 80(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx +; FALLBACK20-NEXT: orl %ebp, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK20-NEXT: leal (%edx,%edx), %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK20-NEXT: addl %ebx, %ebx -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %edi, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 88(%esp,%esi), %ebx +; FALLBACK20-NEXT: leal (%ebx,%ebx), %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %ebp, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: addl %edi, %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %ebp, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 96(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %ebp, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: addl %edi, %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: movl 104(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %ebp, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: addl %edi, %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; 
FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK20-NEXT: movl %edi, %ebp ; FALLBACK20-NEXT: movl %eax, %ecx @@ -14080,52 +14074,55 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: orl %ebp, %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %edi, %edi -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movl %esi, %edx +; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: movl %esi, %ebp ; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK20-NEXT: leal (%eax,%eax), %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: movl 120(%esp,%ebp), %ebp +; FALLBACK20-NEXT: leal (%ebp,%ebp), %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %eax +; FALLBACK20-NEXT: orl %ebx, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: addl %esi, %esi -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl %ebp, %eax ; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %eax, %edx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl 124(%esp,%ecx), %ebx +; FALLBACK20-NEXT: leal (%ebx,%ebx), %ebp +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %eax, %ebp ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK20-NEXT: shrl %cl, %ebx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl %ebx, 60(%eax) -; FALLBACK20-NEXT: movl %edx, 56(%eax) +; FALLBACK20-NEXT: movl 
%ebp, 56(%eax) ; FALLBACK20-NEXT: movl %esi, 48(%eax) -; FALLBACK20-NEXT: movl %ebp, 52(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 52(%eax) ; FALLBACK20-NEXT: movl %edi, 40(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 44(%eax) @@ -14179,91 +14176,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebp -; FALLBACK21-NEXT: andl $60, %ebp -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shll $3, %ecx -; FALLBACK21-NEXT: andl $24, %ecx -; FALLBACK21-NEXT: shrdl %cl, %edx, %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %esi -; FALLBACK21-NEXT: shrdl %cl, %edi, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %ecx, %eax +; FALLBACK21-NEXT: andl $60, %eax +; FALLBACK21-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK21-NEXT: movl 52(%esp,%eax), %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shll $3, %ecx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi +; FALLBACK21-NEXT: movl 64(%esp,%eax), %ebx +; FALLBACK21-NEXT: movl 60(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %edi +; FALLBACK21-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: shrdl %cl, %edx, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 72(%esp,%eax), %edi +; FALLBACK21-NEXT: movl 68(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %esi +; FALLBACK21-NEXT: shrdl %cl, %edi, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 80(%esp,%eax), %ebx +; FALLBACK21-NEXT: movl 76(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %esi +; FALLBACK21-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl %esi, %edx -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax -; 
FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi ; FALLBACK21-NEXT: shrdl %cl, %edx, %edi -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 88(%esp,%eax), %edi +; FALLBACK21-NEXT: movl 84(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %esi +; FALLBACK21-NEXT: shrdl %cl, %edi, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl %edi, %esi ; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK21-NEXT: movl 92(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %ebx +; FALLBACK21-NEXT: shrdl %cl, %edi, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %edx, %esi +; FALLBACK21-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK21-NEXT: movl 104(%esp,%eax), %esi +; FALLBACK21-NEXT: movl 100(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %ebx +; FALLBACK21-NEXT: shrdl %cl, %esi, %ebx +; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: movl 48(%esp,%eax), %ebp +; FALLBACK21-NEXT: movl 108(%esp,%eax), %edx +; FALLBACK21-NEXT: shrdl %cl, %edx, %esi +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movl %esi, 56(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %esi, %ebp ; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: shrl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 60(%ebp) -; FALLBACK21-NEXT: movl %esi, 48(%ebp) -; FALLBACK21-NEXT: movl %edi, 52(%ebp) -; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 4(%ebp) +; 
FALLBACK21-NEXT: shrl %cl, %edx +; FALLBACK21-NEXT: movl %edx, 60(%eax) +; FALLBACK21-NEXT: movl %edi, 48(%eax) +; FALLBACK21-NEXT: movl %ebx, 52(%eax) +; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 40(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 44(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 32(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 36(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 24(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 28(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 16(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 20(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 8(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 12(%eax) +; FALLBACK21-NEXT: movl %ebp, (%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 4(%eax) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -14284,7 +14280,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK22-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK22-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK22-NEXT: movl (%eax), %ecx +; FALLBACK22-NEXT: movl (%eax), %edx ; FALLBACK22-NEXT: xorps %xmm4, %xmm4 ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -14294,112 +14290,115 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK22-NEXT: leal (,%ecx,8), %edx -; FALLBACK22-NEXT: andl $24, %edx -; FALLBACK22-NEXT: andl $60, %ecx -; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi -; FALLBACK22-NEXT: movl 72(%esp,%ecx), %eax +; FALLBACK22-NEXT: leal (,%edx,8), %ecx +; FALLBACK22-NEXT: andl $60, %edx +; FALLBACK22-NEXT: movl 68(%esp,%edx), %esi +; FALLBACK22-NEXT: movl 72(%esp,%edx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %edi -; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: shrxl %ecx, %esi, %edi +; FALLBACK22-NEXT: movl %ecx, %ebx +; FALLBACK22-NEXT: andb $24, %bl ; FALLBACK22-NEXT: notb %bl ; FALLBACK22-NEXT: leal (%eax,%eax), %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebp -; FALLBACK22-NEXT: orl %edi, %ebp -; FALLBACK22-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax +; FALLBACK22-NEXT: orl %edi, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl %ecx, %eax +; FALLBACK22-NEXT: shrxl %ecx, 64(%esp,%edx), %edi ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %edi, %esi -; FALLBACK22-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 
80(%esp,%ecx), %esi +; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx +; FALLBACK22-NEXT: orl %edi, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 80(%esp,%edx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 76(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp -; FALLBACK22-NEXT: orl %ebp, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shlxl %ebx, %edi, %ecx +; FALLBACK22-NEXT: movl 76(%esp,%edx), %edi +; FALLBACK22-NEXT: shrxl %eax, %edi, %ebp +; FALLBACK22-NEXT: orl %ebp, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi -; FALLBACK22-NEXT: orl %eax, %edi +; FALLBACK22-NEXT: orl %ebp, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 88(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 88(%esp,%edx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 84(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: movl 84(%esp,%edx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax ; FALLBACK22-NEXT: orl %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 96(%esp,%ecx), %esi +; FALLBACK22-NEXT: movl 96(%esp,%edx), %esi ; FALLBACK22-NEXT: leal (%esi,%esi), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 92(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: movl 92(%esp,%edx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; FALLBACK22-NEXT: addl %edi, %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %edi ; FALLBACK22-NEXT: orl %eax, %edi ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: movl 104(%esp,%ecx), %eax +; FALLBACK22-NEXT: movl 104(%esp,%edx), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%eax,%eax), %edi ; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: movl 100(%esp,%ecx), %edi -; FALLBACK22-NEXT: shrxl %edx, %edi, %ebp +; FALLBACK22-NEXT: movl 100(%esp,%edx), %edi +; FALLBACK22-NEXT: shrxl %ecx, %edi, %ebp ; FALLBACK22-NEXT: orl %ebp, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %esi -; FALLBACK22-NEXT: addl %edi, %edi -; FALLBACK22-NEXT: shlxl %ebx, %edi, %eax -; FALLBACK22-NEXT: orl %esi, %eax -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: shrxl %ecx, %esi, %esi ; 
FALLBACK22-NEXT: movl %ecx, %eax -; FALLBACK22-NEXT: movl 112(%esp,%ecx), %ecx +; FALLBACK22-NEXT: addl %edi, %edi +; FALLBACK22-NEXT: shlxl %ebx, %edi, %ecx +; FALLBACK22-NEXT: orl %esi, %ecx +; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 112(%esp,%edx), %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal (%ecx,%ecx), %esi ; FALLBACK22-NEXT: shlxl %ebx, %esi, %ecx -; FALLBACK22-NEXT: movl 108(%esp,%eax), %esi -; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, %esi, %ebp +; FALLBACK22-NEXT: movl 108(%esp,%edx), %esi +; FALLBACK22-NEXT: shrxl %eax, %esi, %ebp ; FALLBACK22-NEXT: orl %ebp, %ecx ; FALLBACK22-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload +; FALLBACK22-NEXT: movl %eax, %ebp ; FALLBACK22-NEXT: addl %esi, %esi -; FALLBACK22-NEXT: shlxl %ebx, %esi, %esi -; FALLBACK22-NEXT: orl %ecx, %esi -; FALLBACK22-NEXT: movl 120(%esp,%eax), %ebp -; FALLBACK22-NEXT: leal (%ebp,%ebp), %ecx +; FALLBACK22-NEXT: shlxl %ebx, %esi, %eax +; FALLBACK22-NEXT: orl %ecx, %eax +; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK22-NEXT: movl 120(%esp,%edx), %esi +; FALLBACK22-NEXT: leal (%esi,%esi), %ecx ; FALLBACK22-NEXT: shlxl %ebx, %ecx, %ecx -; FALLBACK22-NEXT: movl 116(%esp,%eax), %eax -; FALLBACK22-NEXT: shrxl %edx, %eax, %edi +; FALLBACK22-NEXT: movl 116(%esp,%edx), %eax +; FALLBACK22-NEXT: shrxl %ebp, %eax, %edi ; FALLBACK22-NEXT: orl %edi, %ecx -; FALLBACK22-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; FALLBACK22-NEXT: shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload ; FALLBACK22-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: addl %eax, %eax ; FALLBACK22-NEXT: shlxl %ebx, %eax, %edi ; FALLBACK22-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload -; FALLBACK22-NEXT: shrxl %edx, %ebp, %eax -; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK22-NEXT: movl 124(%esp,%ebp), %ebp -; FALLBACK22-NEXT: shrxl %edx, %ebp, %edx -; FALLBACK22-NEXT: addl %ebp, %ebp -; FALLBACK22-NEXT: shlxl %ebx, %ebp, %ebx -; FALLBACK22-NEXT: orl %eax, %ebx +; FALLBACK22-NEXT: shrxl %ebp, %esi, %eax +; FALLBACK22-NEXT: movl 124(%esp,%edx), %edx +; FALLBACK22-NEXT: shrxl %ebp, %edx, %ebp +; FALLBACK22-NEXT: addl %edx, %edx +; FALLBACK22-NEXT: shlxl %ebx, %edx, %edx +; FALLBACK22-NEXT: orl %eax, %edx ; FALLBACK22-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK22-NEXT: movl %edx, 60(%eax) -; FALLBACK22-NEXT: movl %ebx, 56(%eax) +; FALLBACK22-NEXT: movl %ebp, 60(%eax) +; FALLBACK22-NEXT: movl %edx, 56(%eax) ; FALLBACK22-NEXT: movl %edi, 48(%eax) ; FALLBACK22-NEXT: movl %ecx, 52(%eax) -; FALLBACK22-NEXT: movl %esi, 40(%eax) +; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK22-NEXT: movl %ecx, 40(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK22-NEXT: movl %ecx, 44(%eax) ; FALLBACK22-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -14458,7 +14457,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shll $3, %ecx -; 
FALLBACK23-NEXT: andl $24, %ecx ; FALLBACK23-NEXT: shrdl %cl, %edx, %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi @@ -14561,105 +14559,108 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %ecx, %esi ; FALLBACK24-NEXT: andl $60, %esi -; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK24-NEXT: movl 68(%esp,%esi), %ebx ; FALLBACK24-NEXT: shll $3, %ecx -; FALLBACK24-NEXT: andl $24, %ecx -; FALLBACK24-NEXT: movl %edx, %edi +; FALLBACK24-NEXT: movl %ebx, %edi ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: movl 72(%esp,%esi), %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%eax,%eax), %ebx -; FALLBACK24-NEXT: movl %ecx, %ebp -; FALLBACK24-NEXT: movb %cl, %ch -; FALLBACK24-NEXT: notb %ch -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%eax,%eax), %ebp +; FALLBACK24-NEXT: movl %ecx, %edx +; FALLBACK24-NEXT: movl %ecx, %eax +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: andb $24, %dl +; FALLBACK24-NEXT: notb %dl +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %ebp, %eax -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %edx, %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %edi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %edx, %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %eax, %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %edi, %ebx ; FALLBACK24-NEXT: movl %ebx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: movl 76(%esp,%esi), %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl 80(%esp,%esi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx +; FALLBACK24-NEXT: orl %ebp, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK24-NEXT: leal (%edx,%edx), %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK24-NEXT: addl %ebx, %ebx -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %edi, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 88(%esp,%esi), %ebx +; FALLBACK24-NEXT: leal (%ebx,%ebx), %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %ebp, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: addl %edi, %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %ebp, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 96(%esp,%esi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl 
%ebp, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: addl %edi, %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: movl 104(%esp,%esi), %ecx +; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %ebp, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: addl %edi, %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK24-NEXT: movl %edi, %ebp ; FALLBACK24-NEXT: movl %eax, %ecx @@ -14667,52 +14668,55 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: orl %ebp, %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %edi, %edi -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movl %esi, %edx +; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: movl %esi, %ebp ; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK24-NEXT: leal (%eax,%eax), %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: movl 120(%esp,%ebp), %ebp +; FALLBACK24-NEXT: leal (%ebp,%ebp), %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %eax +; FALLBACK24-NEXT: orl %ebx, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; 
FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: addl %esi, %esi -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %eax, %edx +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl %ebp, %eax +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl 124(%esp,%ecx), %ebx +; FALLBACK24-NEXT: leal (%ebx,%ebx), %ebp +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %eax, %ebp ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK24-NEXT: shrl %cl, %ebx ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl %ebx, 60(%eax) -; FALLBACK24-NEXT: movl %edx, 56(%eax) +; FALLBACK24-NEXT: movl %ebp, 56(%eax) ; FALLBACK24-NEXT: movl %esi, 48(%eax) -; FALLBACK24-NEXT: movl %ebp, 52(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 52(%eax) ; FALLBACK24-NEXT: movl %edi, 40(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 44(%eax) @@ -14761,91 +14765,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shll $3, %ecx -; FALLBACK25-NEXT: andl $24, %ecx -; FALLBACK25-NEXT: shrdl %cl, %edx, %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %esi -; FALLBACK25-NEXT: shrdl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %ecx, %eax +; FALLBACK25-NEXT: andl $60, %eax +; FALLBACK25-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK25-NEXT: movl 52(%esp,%eax), %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shll $3, %ecx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi +; FALLBACK25-NEXT: movl 64(%esp,%eax), %ebx +; FALLBACK25-NEXT: movl 60(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %edi +; FALLBACK25-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: shrdl %cl, %edx, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 72(%esp,%eax), %edi +; FALLBACK25-NEXT: movl 68(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %esi +; FALLBACK25-NEXT: shrdl %cl, %edi, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 80(%esp,%eax), %ebx +; FALLBACK25-NEXT: movl 76(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %esi +; FALLBACK25-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl %esi, %edx -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi ; FALLBACK25-NEXT: shrdl %cl, %edx, %edi -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 88(%esp,%eax), %edi +; FALLBACK25-NEXT: movl 84(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %esi +; FALLBACK25-NEXT: shrdl %cl, %edi, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl %edi, %esi ; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK25-NEXT: movl 92(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %ebx +; FALLBACK25-NEXT: shrdl %cl, %edi, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %edx, %esi +; FALLBACK25-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK25-NEXT: movl 104(%esp,%eax), %esi +; FALLBACK25-NEXT: movl 100(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %ebx +; FALLBACK25-NEXT: shrdl %cl, %esi, %ebx +; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: movl 48(%esp,%eax), %ebp +; FALLBACK25-NEXT: movl 108(%esp,%eax), %edx +; FALLBACK25-NEXT: shrdl %cl, %edx, %esi +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: movl %esi, 56(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %esi, %ebp ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: shrl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 60(%ebp) -; FALLBACK25-NEXT: movl %esi, 48(%ebp) -; FALLBACK25-NEXT: movl %edi, 52(%ebp) -; FALLBACK25-NEXT: movl 
(%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 4(%ebp) +; FALLBACK25-NEXT: shrl %cl, %edx +; FALLBACK25-NEXT: movl %edx, 60(%eax) +; FALLBACK25-NEXT: movl %edi, 48(%eax) +; FALLBACK25-NEXT: movl %ebx, 52(%eax) +; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 40(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 44(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 32(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 36(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 24(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 28(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 16(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 20(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 8(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 12(%eax) +; FALLBACK25-NEXT: movl %ebp, (%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 4(%eax) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -14872,13 +14875,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: leal (,%ecx,8), %edx -; FALLBACK26-NEXT: andl $24, %edx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: shrxl %edx, %esi, %edi ; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: andb $24, %bl ; FALLBACK26-NEXT: notb %bl ; FALLBACK26-NEXT: leal (%eax,%eax), %ebp ; FALLBACK26-NEXT: shlxl %ebx, %ebp, %ebp @@ -15029,7 +15032,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl 
52(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shll $3, %ecx -; FALLBACK27-NEXT: andl $24, %ecx ; FALLBACK27-NEXT: shrdl %cl, %edx, %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi @@ -15130,105 +15132,108 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %ecx, %esi ; FALLBACK28-NEXT: andl $60, %esi -; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx +; FALLBACK28-NEXT: movl 68(%esp,%esi), %ebx ; FALLBACK28-NEXT: shll $3, %ecx -; FALLBACK28-NEXT: andl $24, %ecx -; FALLBACK28-NEXT: movl %edx, %edi +; FALLBACK28-NEXT: movl %ebx, %edi ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: movl 72(%esp,%esi), %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%eax,%eax), %ebx -; FALLBACK28-NEXT: movl %ecx, %ebp -; FALLBACK28-NEXT: movb %cl, %ch -; FALLBACK28-NEXT: notb %ch -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%eax,%eax), %ebp +; FALLBACK28-NEXT: movl %ecx, %edx +; FALLBACK28-NEXT: movl %ecx, %eax +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: andb $24, %dl +; FALLBACK28-NEXT: notb %dl +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %edi, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %ebp, %eax -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %edx, %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %edi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %edx, %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %eax, %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; 
FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %edi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: movl 76(%esp,%esi), %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl 80(%esp,%esi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx +; FALLBACK28-NEXT: orl %ebp, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK28-NEXT: leal (%edx,%edx), %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK28-NEXT: addl %ebx, %ebx -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %edi, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 88(%esp,%esi), %ebx +; FALLBACK28-NEXT: leal (%ebx,%ebx), %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %ebp, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: addl %edi, %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %ebp, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 96(%esp,%esi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; 
FALLBACK28-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %ebp, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: addl %edi, %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: movl 104(%esp,%esi), %ecx +; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal (%ecx,%ecx), %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %ebp, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: addl %edi, %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi ; FALLBACK28-NEXT: movl %edi, %ebp ; FALLBACK28-NEXT: movl %eax, %ecx @@ -15236,52 +15241,55 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: orl %ebp, %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %edi, %edi -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movl %esi, %edx +; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: movl %esi, %ebp ; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi ; FALLBACK28-NEXT: movl %esi, %ebx -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK28-NEXT: leal (%eax,%eax), %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: movl 120(%esp,%ebp), %ebp +; FALLBACK28-NEXT: leal (%ebp,%ebp), %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %eax +; FALLBACK28-NEXT: orl %ebx, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: addl %esi, %esi -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %eax, %edx +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl %ebp, %eax +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl 124(%esp,%ecx), %ebx +; FALLBACK28-NEXT: leal (%ebx,%ebx), %ebp +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %eax, %ebp ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK28-NEXT: shrl %cl, %ebx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl %ebx, 60(%eax) -; FALLBACK28-NEXT: movl %edx, 56(%eax) +; FALLBACK28-NEXT: movl %ebp, 56(%eax) ; FALLBACK28-NEXT: movl %esi, 48(%eax) -; FALLBACK28-NEXT: movl %ebp, 52(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 52(%eax) ; FALLBACK28-NEXT: movl %edi, 40(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 44(%eax) @@ -15327,91 +15335,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK29-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shll $3, %ecx -; FALLBACK29-NEXT: andl $24, %ecx -; FALLBACK29-NEXT: shrdl %cl, %edx, %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %esi -; FALLBACK29-NEXT: shrdl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %ecx, %eax +; FALLBACK29-NEXT: andl $60, %eax +; FALLBACK29-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK29-NEXT: movl 52(%esp,%eax), %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shll $3, %ecx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi +; FALLBACK29-NEXT: movl 64(%esp,%eax), %ebx +; FALLBACK29-NEXT: movl 60(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %edi +; FALLBACK29-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl 
%eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: shrdl %cl, %edx, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 72(%esp,%eax), %edi +; FALLBACK29-NEXT: movl 68(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %esi +; FALLBACK29-NEXT: shrdl %cl, %edi, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 80(%esp,%eax), %ebx +; FALLBACK29-NEXT: movl 76(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %esi +; FALLBACK29-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl %esi, %edx -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi ; FALLBACK29-NEXT: shrdl %cl, %edx, %edi -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 88(%esp,%eax), %edi +; FALLBACK29-NEXT: movl 84(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %esi +; FALLBACK29-NEXT: shrdl %cl, %edi, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl %edi, %esi ; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK29-NEXT: movl 92(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %ebx +; FALLBACK29-NEXT: shrdl %cl, %edi, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %edx, %esi +; FALLBACK29-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK29-NEXT: movl 104(%esp,%eax), %esi +; FALLBACK29-NEXT: movl 100(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %ebx +; FALLBACK29-NEXT: shrdl %cl, %esi, %ebx +; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: movl 48(%esp,%eax), %ebp +; FALLBACK29-NEXT: movl 108(%esp,%eax), %edx +; FALLBACK29-NEXT: shrdl %cl, %edx, %esi +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: movl %esi, 56(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %esi, %ebp ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: shrl %cl, %eax -; FALLBACK29-NEXT: movl 
%eax, 60(%ebp) -; FALLBACK29-NEXT: movl %esi, 48(%ebp) -; FALLBACK29-NEXT: movl %edi, 52(%ebp) -; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 4(%ebp) +; FALLBACK29-NEXT: shrl %cl, %edx +; FALLBACK29-NEXT: movl %edx, 60(%eax) +; FALLBACK29-NEXT: movl %edi, 48(%eax) +; FALLBACK29-NEXT: movl %ebx, 52(%eax) +; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 40(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 44(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 32(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 36(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 24(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 28(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 16(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 20(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 8(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 12(%eax) +; FALLBACK29-NEXT: movl %ebp, (%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 4(%eax) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -15435,13 +15442,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: leal (,%edx,8), %ecx -; FALLBACK30-NEXT: andl $24, %ecx ; FALLBACK30-NEXT: andl $60, %edx ; FALLBACK30-NEXT: movl 68(%esp,%edx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%edx), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: shrxl %ecx, %esi, %edi ; FALLBACK30-NEXT: movl %ecx, %ebx +; FALLBACK30-NEXT: andb $24, %bl ; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: leal (%eax,%eax), %ebp ; FALLBACK30-NEXT: shlxl %ebx, %ebp, %ebp @@ -15589,7 
+15596,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shll $3, %ecx -; FALLBACK31-NEXT: andl $24, %ecx ; FALLBACK31-NEXT: shrdl %cl, %edx, %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi @@ -16050,7 +16056,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: leal (,%rsi,8), %eax -; FALLBACK0-NEXT: andl $56, %eax ; FALLBACK0-NEXT: andl $56, %esi ; FALLBACK0-NEXT: negl %esi ; FALLBACK0-NEXT: movslq %esi, %rbx @@ -16060,6 +16065,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shlq %cl, %r10 ; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: andb $56, %sil ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq %r8, %r9 ; FALLBACK0-NEXT: shrq %r9 @@ -16158,7 +16164,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK1-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: leal (,%rsi,8), %ecx -; FALLBACK1-NEXT: andl $56, %ecx ; FALLBACK1-NEXT: andl $56, %esi ; FALLBACK1-NEXT: negl %esi ; FALLBACK1-NEXT: movslq %esi, %r9 @@ -16226,27 +16231,27 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rsi,8), %eax -; FALLBACK2-NEXT: andl $56, %eax ; FALLBACK2-NEXT: andl $56, %esi ; FALLBACK2-NEXT: negl %esi ; FALLBACK2-NEXT: movslq %esi, %rsi ; FALLBACK2-NEXT: movq -64(%rsp,%rsi), %r10 ; FALLBACK2-NEXT: movq -56(%rsp,%rsi), %rcx -; FALLBACK2-NEXT: shlxq %rax, %rcx, %r9 +; FALLBACK2-NEXT: shlxq %rax, %rcx, %r8 ; FALLBACK2-NEXT: movq -40(%rsp,%rsi), %rdi ; FALLBACK2-NEXT: shlxq %rax, %rdi, %r11 ; FALLBACK2-NEXT: movq -48(%rsp,%rsi), %r14 ; FALLBACK2-NEXT: shlxq %rax, %r14, %rbx -; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r8 -; FALLBACK2-NEXT: shlxq %rax, %r8, %r15 +; FALLBACK2-NEXT: movq -24(%rsp,%rsi), %r9 +; FALLBACK2-NEXT: shlxq %rax, %r9, %r15 ; FALLBACK2-NEXT: shlxq %rax, %r10, %r12 ; FALLBACK2-NEXT: movl %eax, %r13d +; FALLBACK2-NEXT: andb $56, %r13b ; FALLBACK2-NEXT: notb %r13b ; FALLBACK2-NEXT: shrq %r10 ; FALLBACK2-NEXT: shrxq %r13, %r10, %r10 -; FALLBACK2-NEXT: orq %r9, %r10 -; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r9 -; FALLBACK2-NEXT: shlxq %rax, %r9, %rbp +; FALLBACK2-NEXT: orq %r8, %r10 +; FALLBACK2-NEXT: movq -32(%rsp,%rsi), %r8 +; FALLBACK2-NEXT: shlxq %rax, %r8, %rbp ; FALLBACK2-NEXT: shrq %r14 ; FALLBACK2-NEXT: shrxq %r13, %r14, %r14 ; FALLBACK2-NEXT: orq %r11, %r14 @@ -16256,23 +16261,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: shrq %rcx ; FALLBACK2-NEXT: shrxq %r13, %rcx, %rcx ; FALLBACK2-NEXT: orq %rbx, %rcx -; FALLBACK2-NEXT: shrq %r9 -; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 -; FALLBACK2-NEXT: orq %r15, %r9 +; FALLBACK2-NEXT: shrq %r8 +; FALLBACK2-NEXT: shrxq %r13, %r8, %r8 +; FALLBACK2-NEXT: orq %r15, %r8 ; FALLBACK2-NEXT: shrq %rdi ; FALLBACK2-NEXT: shrxq %r13, %rdi, %rdi ; FALLBACK2-NEXT: orq %rbp, %rdi ; FALLBACK2-NEXT: shrq %rsi ; FALLBACK2-NEXT: shrxq %r13, %rsi, %rsi ; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: shrq %r8 -; 
FALLBACK2-NEXT: shrxq %r13, %r8, %r8 -; FALLBACK2-NEXT: orq %rax, %r8 +; FALLBACK2-NEXT: shrq %r9 +; FALLBACK2-NEXT: shrxq %r13, %r9, %r9 +; FALLBACK2-NEXT: orq %rax, %r9 ; FALLBACK2-NEXT: movq %r12, (%rdx) -; FALLBACK2-NEXT: movq %r8, 48(%rdx) +; FALLBACK2-NEXT: movq %r9, 48(%rdx) ; FALLBACK2-NEXT: movq %rsi, 56(%rdx) ; FALLBACK2-NEXT: movq %rdi, 32(%rdx) -; FALLBACK2-NEXT: movq %r9, 40(%rdx) +; FALLBACK2-NEXT: movq %r8, 40(%rdx) ; FALLBACK2-NEXT: movq %rcx, 16(%rdx) ; FALLBACK2-NEXT: movq %r14, 24(%rdx) ; FALLBACK2-NEXT: movq %r10, 8(%rdx) @@ -16313,7 +16318,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: leal (,%rsi,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx ; FALLBACK3-NEXT: andl $56, %esi ; FALLBACK3-NEXT: negl %esi ; FALLBACK3-NEXT: movslq %esi, %r8 @@ -16370,7 +16374,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: leal (,%rcx,8), %eax -; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %ecx ; FALLBACK4-NEXT: negl %ecx ; FALLBACK4-NEXT: movslq %ecx, %r9 @@ -16379,6 +16382,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shlq %cl, %r10 ; FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: andb $56, %sil ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: movq -32(%rsp,%r9), %r11 ; FALLBACK4-NEXT: movq %r11, %r8 @@ -16470,7 +16474,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx ; FALLBACK5-NEXT: andl $56, %eax ; FALLBACK5-NEXT: negl %eax ; FALLBACK5-NEXT: movslq %eax, %r8 @@ -16531,14 +16534,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: leal (,%rax,8), %ecx -; FALLBACK6-NEXT: andl $56, %ecx ; FALLBACK6-NEXT: andl $56, %eax ; FALLBACK6-NEXT: negl %eax ; FALLBACK6-NEXT: movslq %eax, %rsi ; FALLBACK6-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK6-NEXT: shlxq %rcx, %rax, %r12 +; FALLBACK6-NEXT: shlxq %rcx, %rax, %r15 ; FALLBACK6-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r15 +; FALLBACK6-NEXT: shlxq %rcx, %rdi, %r12 ; FALLBACK6-NEXT: movq -24(%rsp,%rsi), %r13 ; FALLBACK6-NEXT: shlxq %rcx, %r13, %r8 ; FALLBACK6-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -16547,16 +16549,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq -40(%rsp,%rsi), %r14 ; FALLBACK6-NEXT: shlxq %rcx, %r14, %rbx ; FALLBACK6-NEXT: movl %ecx, %r9d +; FALLBACK6-NEXT: andb $56, %r9b ; FALLBACK6-NEXT: notb %r9b ; FALLBACK6-NEXT: shrq %rdi ; FALLBACK6-NEXT: shrxq %r9, %rdi, %rdi -; FALLBACK6-NEXT: orq %r12, %rdi +; FALLBACK6-NEXT: orq %r15, %rdi ; FALLBACK6-NEXT: movq (%rsp,%rsi), %rbp ; FALLBACK6-NEXT: shlxq %rcx, %rbp, %r8 ; FALLBACK6-NEXT: shrq %r13 -; FALLBACK6-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK6-NEXT: orq %r15, %r12 -; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 +; FALLBACK6-NEXT: shrxq %r9, %r13, %r15 +; FALLBACK6-NEXT: orq %r12, %r15 +; FALLBACK6-NEXT: shlxq %rcx, 8(%rsp,%rsi), 
%r12 ; FALLBACK6-NEXT: movq -48(%rsp,%rsi), %rsi ; FALLBACK6-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK6-NEXT: shrq %r11 @@ -16573,14 +16576,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: orq %r8, %rax ; FALLBACK6-NEXT: shrq %rbp ; FALLBACK6-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK6-NEXT: orq %r15, %r8 +; FALLBACK6-NEXT: orq %r12, %r8 ; FALLBACK6-NEXT: movq %rcx, (%rdx) ; FALLBACK6-NEXT: movq %r8, 56(%rdx) ; FALLBACK6-NEXT: movq %rax, 48(%rdx) ; FALLBACK6-NEXT: movq %rsi, 8(%rdx) ; FALLBACK6-NEXT: movq %r14, 16(%rdx) ; FALLBACK6-NEXT: movq %r11, 24(%rdx) -; FALLBACK6-NEXT: movq %r12, 32(%rdx) +; FALLBACK6-NEXT: movq %r15, 32(%rdx) ; FALLBACK6-NEXT: movq %rdi, 40(%rdx) ; FALLBACK6-NEXT: addq $24, %rsp ; FALLBACK6-NEXT: popq %rbx @@ -16611,7 +16614,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: andl $56, %ecx ; FALLBACK7-NEXT: andl $56, %eax ; FALLBACK7-NEXT: negl %eax ; FALLBACK7-NEXT: movslq %eax, %r8 @@ -16663,7 +16665,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: leal (,%rcx,8), %eax -; FALLBACK8-NEXT: andl $56, %eax ; FALLBACK8-NEXT: andl $56, %ecx ; FALLBACK8-NEXT: negl %ecx ; FALLBACK8-NEXT: movslq %ecx, %r9 @@ -16672,6 +16673,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shlq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: andb $56, %sil ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: movq -32(%rsp,%r9), %r11 ; FALLBACK8-NEXT: movq %r11, %r8 @@ -16758,7 +16760,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: andl $56, %ecx ; FALLBACK9-NEXT: andl $56, %eax ; FALLBACK9-NEXT: negl %eax ; FALLBACK9-NEXT: movslq %eax, %r8 @@ -16814,14 +16815,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: leal (,%rax,8), %ecx -; FALLBACK10-NEXT: andl $56, %ecx ; FALLBACK10-NEXT: andl $56, %eax ; FALLBACK10-NEXT: negl %eax ; FALLBACK10-NEXT: movslq %eax, %rsi ; FALLBACK10-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK10-NEXT: shlxq %rcx, %rax, %r12 +; FALLBACK10-NEXT: shlxq %rcx, %rax, %r15 ; FALLBACK10-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r15 +; FALLBACK10-NEXT: shlxq %rcx, %rdi, %r12 ; FALLBACK10-NEXT: movq -24(%rsp,%rsi), %r13 ; FALLBACK10-NEXT: shlxq %rcx, %r13, %r8 ; FALLBACK10-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -16830,16 +16830,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq -40(%rsp,%rsi), %r14 ; FALLBACK10-NEXT: shlxq %rcx, %r14, %rbx ; FALLBACK10-NEXT: movl %ecx, %r9d +; FALLBACK10-NEXT: andb $56, %r9b ; FALLBACK10-NEXT: notb %r9b ; FALLBACK10-NEXT: shrq %rdi ; FALLBACK10-NEXT: shrxq %r9, %rdi, %rdi -; FALLBACK10-NEXT: orq %r12, %rdi +; FALLBACK10-NEXT: orq %r15, %rdi ; FALLBACK10-NEXT: movq (%rsp,%rsi), %rbp ; FALLBACK10-NEXT: shlxq %rcx, %rbp, 
%r8 ; FALLBACK10-NEXT: shrq %r13 -; FALLBACK10-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK10-NEXT: orq %r15, %r12 -; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 +; FALLBACK10-NEXT: shrxq %r9, %r13, %r15 +; FALLBACK10-NEXT: orq %r12, %r15 +; FALLBACK10-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r12 ; FALLBACK10-NEXT: movq -48(%rsp,%rsi), %rsi ; FALLBACK10-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK10-NEXT: shrq %r11 @@ -16856,14 +16857,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: orq %r8, %rax ; FALLBACK10-NEXT: shrq %rbp ; FALLBACK10-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK10-NEXT: orq %r15, %r8 +; FALLBACK10-NEXT: orq %r12, %r8 ; FALLBACK10-NEXT: movq %rcx, (%rdx) ; FALLBACK10-NEXT: movq %r8, 56(%rdx) ; FALLBACK10-NEXT: movq %rax, 48(%rdx) ; FALLBACK10-NEXT: movq %rsi, 8(%rdx) ; FALLBACK10-NEXT: movq %r14, 16(%rdx) ; FALLBACK10-NEXT: movq %r11, 24(%rdx) -; FALLBACK10-NEXT: movq %r12, 32(%rdx) +; FALLBACK10-NEXT: movq %r15, 32(%rdx) ; FALLBACK10-NEXT: movq %rdi, 40(%rdx) ; FALLBACK10-NEXT: addq $24, %rsp ; FALLBACK10-NEXT: popq %rbx @@ -16889,7 +16890,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: andl $56, %ecx ; FALLBACK11-NEXT: andl $56, %eax ; FALLBACK11-NEXT: negl %eax ; FALLBACK11-NEXT: movslq %eax, %r8 @@ -16939,7 +16939,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: leal (,%rcx,8), %eax -; FALLBACK12-NEXT: andl $56, %eax ; FALLBACK12-NEXT: andl $56, %ecx ; FALLBACK12-NEXT: negl %ecx ; FALLBACK12-NEXT: movslq %ecx, %r9 @@ -16948,6 +16947,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shlq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: andb $56, %sil ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: movq -32(%rsp,%r9), %r11 ; FALLBACK12-NEXT: movq %r11, %r8 @@ -17031,7 +17031,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: leal (,%rax,8), %ecx -; FALLBACK13-NEXT: andl $56, %ecx ; FALLBACK13-NEXT: andl $56, %eax ; FALLBACK13-NEXT: negl %eax ; FALLBACK13-NEXT: movslq %eax, %r8 @@ -17084,14 +17083,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: leal (,%rax,8), %ecx -; FALLBACK14-NEXT: andl $56, %ecx ; FALLBACK14-NEXT: andl $56, %eax ; FALLBACK14-NEXT: negl %eax ; FALLBACK14-NEXT: movslq %eax, %rsi ; FALLBACK14-NEXT: movq -8(%rsp,%rsi), %rax -; FALLBACK14-NEXT: shlxq %rcx, %rax, %r12 +; FALLBACK14-NEXT: shlxq %rcx, %rax, %r15 ; FALLBACK14-NEXT: movq -16(%rsp,%rsi), %rdi -; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r15 +; FALLBACK14-NEXT: shlxq %rcx, %rdi, %r12 ; FALLBACK14-NEXT: movq -24(%rsp,%rsi), %r13 ; FALLBACK14-NEXT: shlxq %rcx, %r13, %r8 ; FALLBACK14-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill @@ -17100,16 +17098,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq -40(%rsp,%rsi), %r14 ; FALLBACK14-NEXT: shlxq 
%rcx, %r14, %rbx ; FALLBACK14-NEXT: movl %ecx, %r9d +; FALLBACK14-NEXT: andb $56, %r9b ; FALLBACK14-NEXT: notb %r9b ; FALLBACK14-NEXT: shrq %rdi ; FALLBACK14-NEXT: shrxq %r9, %rdi, %rdi -; FALLBACK14-NEXT: orq %r12, %rdi +; FALLBACK14-NEXT: orq %r15, %rdi ; FALLBACK14-NEXT: movq (%rsp,%rsi), %rbp ; FALLBACK14-NEXT: shlxq %rcx, %rbp, %r8 ; FALLBACK14-NEXT: shrq %r13 -; FALLBACK14-NEXT: shrxq %r9, %r13, %r12 -; FALLBACK14-NEXT: orq %r15, %r12 -; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r15 +; FALLBACK14-NEXT: shrxq %r9, %r13, %r15 +; FALLBACK14-NEXT: orq %r12, %r15 +; FALLBACK14-NEXT: shlxq %rcx, 8(%rsp,%rsi), %r12 ; FALLBACK14-NEXT: movq -48(%rsp,%rsi), %rsi ; FALLBACK14-NEXT: shlxq %rcx, %rsi, %rcx ; FALLBACK14-NEXT: shrq %r11 @@ -17126,14 +17125,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: orq %r8, %rax ; FALLBACK14-NEXT: shrq %rbp ; FALLBACK14-NEXT: shrxq %r9, %rbp, %r8 -; FALLBACK14-NEXT: orq %r15, %r8 +; FALLBACK14-NEXT: orq %r12, %r8 ; FALLBACK14-NEXT: movq %rcx, (%rdx) ; FALLBACK14-NEXT: movq %r8, 56(%rdx) ; FALLBACK14-NEXT: movq %rax, 48(%rdx) ; FALLBACK14-NEXT: movq %rsi, 8(%rdx) ; FALLBACK14-NEXT: movq %r14, 16(%rdx) ; FALLBACK14-NEXT: movq %r11, 24(%rdx) -; FALLBACK14-NEXT: movq %r12, 32(%rdx) +; FALLBACK14-NEXT: movq %r15, 32(%rdx) ; FALLBACK14-NEXT: movq %rdi, 40(%rdx) ; FALLBACK14-NEXT: addq $24, %rsp ; FALLBACK14-NEXT: popq %rbx @@ -17156,7 +17155,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: vmovups %zmm0, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: andl $56, %ecx ; FALLBACK15-NEXT: andl $56, %eax ; FALLBACK15-NEXT: negl %eax ; FALLBACK15-NEXT: movslq %eax, %r8 @@ -17202,7 +17200,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: subl $204, %esp ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl (%eax), %ecx -; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ecx, (%esp) # 4-byte Spill ; FALLBACK16-NEXT: movl 4(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 8(%eax), %ecx @@ -17221,233 +17219,240 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 36(%eax), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 40(%eax), %ebp -; FALLBACK16-NEXT: movl 44(%eax), %ebx -; FALLBACK16-NEXT: movl 48(%eax), %edi -; FALLBACK16-NEXT: movl 52(%eax), %esi -; FALLBACK16-NEXT: movl 56(%eax), %edx -; FALLBACK16-NEXT: movl 60(%eax), %ecx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl (%eax), %eax +; FALLBACK16-NEXT: movl 40(%eax), %ebx +; FALLBACK16-NEXT: movl 44(%eax), %edi +; FALLBACK16-NEXT: movl 48(%eax), %esi +; FALLBACK16-NEXT: movl 52(%eax), %edx +; FALLBACK16-NEXT: movl 56(%eax), %ecx +; FALLBACK16-NEXT: movl 60(%eax), %eax +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ebp +; FALLBACK16-NEXT: movl (%ebp), %ebp ; FALLBACK16-NEXT: xorps %xmm0, %xmm0 ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %edi, 
{{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ebx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %eax, %edx -; FALLBACK16-NEXT: andl $60, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK16-NEXT: subl %edx, %ecx -; FALLBACK16-NEXT: movl (%ecx), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 4(%ecx), %edx -; FALLBACK16-NEXT: movl %ecx, %ebp -; FALLBACK16-NEXT: shll $3, %eax -; FALLBACK16-NEXT: andl $24, %eax -; FALLBACK16-NEXT: movl %edx, %esi -; FALLBACK16-NEXT: movl %eax, %ecx -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %al, %ch -; FALLBACK16-NEXT: notb %ch -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 12(%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 8(%ebp), %esi -; FALLBACK16-NEXT: movl %ebp, %edi +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK16-NEXT: movl %ebp, %ebx +; FALLBACK16-NEXT: movl %ebp, %ecx +; FALLBACK16-NEXT: andl $60, %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal {{[0-9]+}}(%esp), %eax +; FALLBACK16-NEXT: subl %ecx, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl (%eax), %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 4(%eax), %edi +; FALLBACK16-NEXT: shll $3, %ebx +; FALLBACK16-NEXT: movl %edi, %eax +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movl %ebx, %edx +; FALLBACK16-NEXT: andb $24, %dl +; FALLBACK16-NEXT: notb %dl +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: orl %eax, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK16-NEXT: movl 12(%esi), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl 8(%esi), %esi ; FALLBACK16-NEXT: movl %esi, %ebp ; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %ebx, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %edi, %ebp -; FALLBACK16-NEXT: movl 20(%edi), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 16(%edi), %esi -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: movl %ebx, %ebp ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %edi ; FALLBACK16-NEXT: orl %esi, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movl 28(%ebp), %ebx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK16-NEXT: movl 20(%edi), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl 16(%edi), %esi +; FALLBACK16-NEXT: movl %esi, %ebx +; FALLBACK16-NEXT: shrl %ebx +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %ebx +; FALLBACK16-NEXT: orl %eax, %ebx ; 
FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 24(%ebp), %esi -; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl %ebp, %ebx +; FALLBACK16-NEXT: movl %ebx, %ecx ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %esi, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 36(%edx), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl 32(%edx), %esi -; FALLBACK16-NEXT: movl %edx, %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %eax +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: orl %esi, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 28(%edi), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl 24(%edi), %ebp +; FALLBACK16-NEXT: movl %ebp, %esi +; FALLBACK16-NEXT: shrl %esi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %esi +; FALLBACK16-NEXT: orl %eax, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %ebp, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; FALLBACK16-NEXT: movl 36(%ebp), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: movl 32(%ebp), %esi ; FALLBACK16-NEXT: movl %esi, %edi ; FALLBACK16-NEXT: shrl %edi -; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %ebx, %edi +; FALLBACK16-NEXT: orl %eax, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl %ebx, %ecx ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 44(%ebp), %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %ebx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %eax +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: orl %esi, 
%eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 44(%ebp), %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: shll %cl, %eax ; FALLBACK16-NEXT: movl 40(%ebp), %esi -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl %esi, %edi +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %eax, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, (%esp) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebx, %ecx ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %esi, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: shrl %eax +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: orl %esi, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl 52(%ebp), %esi ; FALLBACK16-NEXT: movl %esi, %edi -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl %ebx, %ecx ; FALLBACK16-NEXT: shll %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: negl %edx -; FALLBACK16-NEXT: movl 176(%esp,%edx), %ebx -; FALLBACK16-NEXT: movl %ebx, %ebp -; FALLBACK16-NEXT: shrl %ebp -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %ebp -; FALLBACK16-NEXT: orl %edi, %ebp -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: negl %eax +; FALLBACK16-NEXT: movl 176(%esp,%eax), %ebx +; FALLBACK16-NEXT: movl %ebx, %eax +; FALLBACK16-NEXT: shrl %eax +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: orl %edi, %eax +; FALLBACK16-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: shrl %edx -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: orl %ebx, %edx -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK16-NEXT: movl 60(%edi), %edx -; FALLBACK16-NEXT: movb %al, %cl -; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: shrl %edi +; FALLBACK16-NEXT: movl %edx, %ecx +; FALLBACK16-NEXT: shrl %cl, %edi +; FALLBACK16-NEXT: orl %ebx, %edi +; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %ebp, %edi +; FALLBACK16-NEXT: movl 60(%ebp), %ebp +; FALLBACK16-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: movl 56(%edi), %ebx ; FALLBACK16-NEXT: movl %ebx, %edi ; FALLBACK16-NEXT: shrl %edi 
-; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: orl %edx, %edi -; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: orl %ebp, %edi +; FALLBACK16-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK16-NEXT: shll %cl, %ebx ; FALLBACK16-NEXT: shrl %esi -; FALLBACK16-NEXT: movb %ch, %cl +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: orl %ebx, %esi -; FALLBACK16-NEXT: movl %eax, %ecx +; FALLBACK16-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax -; FALLBACK16-NEXT: movl %edx, (%eax) -; FALLBACK16-NEXT: movl %esi, 56(%eax) -; FALLBACK16-NEXT: movl %edi, 60(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 48(%eax) -; FALLBACK16-NEXT: movl %ebp, 52(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 40(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 44(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 32(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 36(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 24(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 28(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 16(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 20(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 8(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 12(%eax) -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: movl %ecx, 4(%eax) +; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %ecx +; FALLBACK16-NEXT: movl %edx, (%ecx) +; FALLBACK16-NEXT: movl %esi, 56(%ecx) +; FALLBACK16-NEXT: movl %edi, 60(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: movl %edx, 48(%ecx) +; FALLBACK16-NEXT: movl %eax, 52(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 40(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 44(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 32(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 36(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 24(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 28(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 16(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 20(%ecx) +; 
FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 8(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 12(%ecx) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl %eax, 4(%ecx) ; FALLBACK16-NEXT: addl $204, %esp ; FALLBACK16-NEXT: popl %esi ; FALLBACK16-NEXT: popl %edi @@ -17529,7 +17534,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: movl 8(%eax), %esi ; FALLBACK17-NEXT: movl 12(%eax), %edx ; FALLBACK17-NEXT: shll $3, %ecx -; FALLBACK17-NEXT: andl $24, %ecx ; FALLBACK17-NEXT: movl %edx, %edi ; FALLBACK17-NEXT: shldl %cl, %esi, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -17682,7 +17686,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK18-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: leal (,%ebp,8), %edx -; FALLBACK18-NEXT: andl $24, %edx ; FALLBACK18-NEXT: andl $60, %ebp ; FALLBACK18-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: leal {{[0-9]+}}(%esp), %edi @@ -17692,6 +17695,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl 4(%edi), %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: andb $24, %bl ; FALLBACK18-NEXT: notb %bl ; FALLBACK18-NEXT: shrl %ecx ; FALLBACK18-NEXT: shrxl %ebx, %ecx, %esi @@ -17894,7 +17898,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK19-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK19-NEXT: leal (,%ebp,8), %ecx -; FALLBACK19-NEXT: andl $24, %ecx ; FALLBACK19-NEXT: andl $60, %ebp ; FALLBACK19-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK19-NEXT: subl %ebp, %eax @@ -17999,7 +18002,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK20-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK20-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK20-NEXT: movl (%eax), %eax +; FALLBACK20-NEXT: movl (%eax), %ecx ; FALLBACK20-NEXT: xorps %xmm4, %xmm4 ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -18009,160 +18012,172 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: andl $60, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK20-NEXT: subl %edx, %ecx -; FALLBACK20-NEXT: movl (%ecx), %edi +; FALLBACK20-NEXT: movl %ecx, %eax +; FALLBACK20-NEXT: andl $60, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK20-NEXT: subl %eax, %edx +; FALLBACK20-NEXT: movl (%edx), %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 4(%ecx), %edx -; FALLBACK20-NEXT: movl %ecx, %ebp -; FALLBACK20-NEXT: shll $3, %eax -; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %esi -; FALLBACK20-NEXT: movl 
%eax, %ecx +; FALLBACK20-NEXT: movl 4(%edx), %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %edx, %ebx +; FALLBACK20-NEXT: shll $3, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %al, %ch -; FALLBACK20-NEXT: notb %ch -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %ecx, %edx +; FALLBACK20-NEXT: movl %ecx, %eax +; FALLBACK20-NEXT: andb $24, %dl +; FALLBACK20-NEXT: notb %dl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: orl %esi, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 12(%ebp), %ebx +; FALLBACK20-NEXT: movl %ebx, %esi ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl 12(%ebx), %ebx +; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 8(%ebp), %esi -; FALLBACK20-NEXT: movl %ebp, %edi +; FALLBACK20-NEXT: movl 8(%esi), %esi ; FALLBACK20-NEXT: movl %esi, %ebp ; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: orl %ebx, %ebp ; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl %eax, %edi ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl 20(%edi), %ebx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %esi, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK20-NEXT: movl 20(%esi), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %edi, %eax +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 16(%edi), %esi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl 16(%esi), %esi +; FALLBACK20-NEXT: movl %esi, %ebp +; FALLBACK20-NEXT: shrl %ebp +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %ebp +; FALLBACK20-NEXT: orl %ebx, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %esi ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi ; FALLBACK20-NEXT: orl %esi, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %ebp, %edx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK20-NEXT: movl 28(%ebp), %ebx ; 
FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: movl %eax, %edi ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl 24(%ebp), %esi -; FALLBACK20-NEXT: movl %esi, %edi -; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi -; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %ebx, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %edi, %ecx +; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: orl %esi, %ebp -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 36(%edx), %ebx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %esi, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 36(%ebp), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %edi, %eax +; FALLBACK20-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl 32(%edx), %esi -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movl %esi, %edi +; FALLBACK20-NEXT: movl 32(%ebp), %ebp +; FALLBACK20-NEXT: movl %ebp, %esi +; FALLBACK20-NEXT: shrl %esi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: orl %ebx, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %eax, %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %ebx, %edi +; FALLBACK20-NEXT: orl %ebp, %edi ; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK20-NEXT: movl 44(%ebp), %ebx ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: movl 40(%ebp), %esi -; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, 
%edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %ebx, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %esi, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK20-NEXT: shrl %edi +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %edi +; FALLBACK20-NEXT: orl %esi, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl 52(%ebp), %esi ; FALLBACK20-NEXT: movl %esi, %edi -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: negl %edx -; FALLBACK20-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: negl %eax +; FALLBACK20-NEXT: movl 176(%esp,%eax), %ebx ; FALLBACK20-NEXT: movl %ebx, %ebp ; FALLBACK20-NEXT: shrl %ebp -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %ebp ; FALLBACK20-NEXT: orl %edi, %ebp -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: shrl %edx -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shrl %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shrl %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: orl %ebx, %eax +; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK20-NEXT: movl 60(%edi), %edx -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shll %cl, %edx +; FALLBACK20-NEXT: movl 60(%edi), %eax +; FALLBACK20-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK20-NEXT: shll %cl, %eax ; FALLBACK20-NEXT: movl 56(%edi), %ebx ; FALLBACK20-NEXT: movl %ebx, %edi ; FALLBACK20-NEXT: shrl %edi -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: orl %eax, %edi +; FALLBACK20-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: shll %cl, %ebx ; FALLBACK20-NEXT: shrl %esi -; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: movl %edx, %ecx ; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: orl %ebx, %esi ; FALLBACK20-NEXT: movl %eax, %ecx @@ -18234,7 +18249,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr 
%byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movl 8(%eax), %esi ; FALLBACK21-NEXT: movl 12(%eax), %edx ; FALLBACK21-NEXT: shll $3, %ecx -; FALLBACK21-NEXT: andl $24, %ecx ; FALLBACK21-NEXT: movl %edx, %edi ; FALLBACK21-NEXT: shldl %cl, %esi, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -18343,7 +18357,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx ; FALLBACK22-NEXT: andl $60, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: leal {{[0-9]+}}(%esp), %edi @@ -18353,6 +18366,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl 4(%edi), %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: andb $24, %bl ; FALLBACK22-NEXT: notb %bl ; FALLBACK22-NEXT: shrl %ecx ; FALLBACK22-NEXT: shrxl %ebx, %ecx, %esi @@ -18500,7 +18514,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movups 16(%ecx), %xmm1 ; FALLBACK23-NEXT: movups 32(%ecx), %xmm2 ; FALLBACK23-NEXT: movups 48(%ecx), %xmm3 -; FALLBACK23-NEXT: movl (%eax), %ebp +; FALLBACK23-NEXT: movl (%eax), %ebx ; FALLBACK23-NEXT: xorps %xmm4, %xmm4 ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm4, {{[0-9]+}}(%esp) @@ -18510,47 +18524,46 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK23-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; FALLBACK23-NEXT: leal (,%ebp,8), %ecx -; FALLBACK23-NEXT: andl $24, %ecx -; FALLBACK23-NEXT: andl $60, %ebp +; FALLBACK23-NEXT: leal (,%ebx,8), %ecx +; FALLBACK23-NEXT: andl $60, %ebx ; FALLBACK23-NEXT: leal {{[0-9]+}}(%esp), %eax -; FALLBACK23-NEXT: subl %ebp, %eax +; FALLBACK23-NEXT: subl %ebx, %eax ; FALLBACK23-NEXT: movl 4(%eax), %esi ; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 8(%eax), %edi ; FALLBACK23-NEXT: movl 12(%eax), %edx -; FALLBACK23-NEXT: movl %edx, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl %edx, %ebp +; FALLBACK23-NEXT: shldl %cl, %edi, %ebp +; FALLBACK23-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %esi, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 16(%eax), %edi ; FALLBACK23-NEXT: movl 20(%eax), %esi -; FALLBACK23-NEXT: movl %esi, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl %esi, %ebp +; FALLBACK23-NEXT: shldl %cl, %edi, %ebp +; FALLBACK23-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %edx, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 24(%eax), %edi ; FALLBACK23-NEXT: movl 28(%eax), %edx -; FALLBACK23-NEXT: movl %edx, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl %edx, %ebp +; FALLBACK23-NEXT: shldl %cl, %edi, %ebp +; FALLBACK23-NEXT: movl %ebp, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %esi, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 32(%eax), %edi ; FALLBACK23-NEXT: movl 36(%eax), %esi -; FALLBACK23-NEXT: movl %esi, %ebx -; FALLBACK23-NEXT: shldl %cl, %edi, %ebx -; FALLBACK23-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK23-NEXT: movl %esi, %ebp +; FALLBACK23-NEXT: shldl %cl, %edi, %ebp +; FALLBACK23-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shldl %cl, %edx, %edi ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: movl 40(%eax), %ebx +; FALLBACK23-NEXT: movl 40(%eax), %ebp ; FALLBACK23-NEXT: movl 44(%eax), %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %ebx, %edx +; FALLBACK23-NEXT: shldl %cl, %ebp, %edx ; FALLBACK23-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK23-NEXT: shldl %cl, %esi, %ebx +; FALLBACK23-NEXT: shldl %cl, %esi, %ebp ; FALLBACK23-NEXT: movl 56(%eax), %edx ; FALLBACK23-NEXT: movl 60(%eax), %edi ; FALLBACK23-NEXT: shldl %cl, %edx, %edi @@ -18558,8 +18571,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 52(%eax), %esi ; FALLBACK23-NEXT: shldl %cl, %esi, %edx -; FALLBACK23-NEXT: negl %ebp -; FALLBACK23-NEXT: movl 176(%esp,%ebp), %ebp +; FALLBACK23-NEXT: negl %ebx +; FALLBACK23-NEXT: movl 176(%esp,%ebx), %ebx ; FALLBACK23-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK23-NEXT: movl %edx, 56(%eax) ; FALLBACK23-NEXT: movl %edi, 60(%eax) @@ -18568,13 +18581,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK23-NEXT: shldl %cl, %edx, %edi -; FALLBACK23-NEXT: shldl %cl, %ebp, %esi +; FALLBACK23-NEXT: shldl %cl, %ebx, %esi ; FALLBACK23-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK23-NEXT: shldl %cl, %edx, %ebp -; FALLBACK23-NEXT: movl %ebp, 48(%eax) +; FALLBACK23-NEXT: shldl %cl, %edx, %ebx +; FALLBACK23-NEXT: movl %ebx, 48(%eax) ; FALLBACK23-NEXT: movl %esi, 52(%eax) -; FALLBACK23-NEXT: movl %ebx, 40(%eax) +; FALLBACK23-NEXT: movl %ebp, 40(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK23-NEXT: movl %ecx, 44(%eax) ; FALLBACK23-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -18614,166 +18627,178 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 ; FALLBACK24-NEXT: vmovups 32(%ecx), %ymm1 -; FALLBACK24-NEXT: movl (%eax), %eax +; FALLBACK24-NEXT: movl (%eax), %ecx ; FALLBACK24-NEXT: vxorps %xmm2, %xmm2, %xmm2 ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm2, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: andl $60, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: subl %edx, %ecx -; FALLBACK24-NEXT: movl (%ecx), %edi +; FALLBACK24-NEXT: movl %ecx, %eax +; 
FALLBACK24-NEXT: andl $60, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK24-NEXT: subl %eax, %edx +; FALLBACK24-NEXT: movl (%edx), %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 4(%ecx), %edx -; FALLBACK24-NEXT: movl %ecx, %ebp -; FALLBACK24-NEXT: shll $3, %eax -; FALLBACK24-NEXT: andl $24, %eax -; FALLBACK24-NEXT: movl %edx, %esi -; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl 4(%edx), %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %edx, %ebx +; FALLBACK24-NEXT: shll $3, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %al, %ch -; FALLBACK24-NEXT: notb %ch -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %ecx, %edx +; FALLBACK24-NEXT: movl %ecx, %eax +; FALLBACK24-NEXT: andb $24, %dl +; FALLBACK24-NEXT: notb %dl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: orl %esi, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 12(%ebp), %ebx +; FALLBACK24-NEXT: movl %ebx, %esi ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl 12(%ebx), %ebx +; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 8(%ebp), %esi -; FALLBACK24-NEXT: movl %ebp, %edi +; FALLBACK24-NEXT: movl 8(%esi), %esi ; FALLBACK24-NEXT: movl %esi, %ebp ; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: orl %ebx, %ebp ; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl %eax, %edi ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl 20(%edi), %ebx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %esi, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK24-NEXT: movl 20(%esi), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %edi, %eax +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 16(%edi), %esi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl 16(%esi), %esi +; FALLBACK24-NEXT: movl %esi, %ebp +; FALLBACK24-NEXT: shrl %ebp +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %ebp +; FALLBACK24-NEXT: orl %ebx, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 
%eax, %ecx ; FALLBACK24-NEXT: shll %cl, %esi ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi ; FALLBACK24-NEXT: orl %esi, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %ebp, %edx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK24-NEXT: movl 28(%ebp), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: movl %eax, %edi ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl 24(%ebp), %esi -; FALLBACK24-NEXT: movl %esi, %edi -; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi -; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %ebx, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %edi, %ecx +; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: orl %esi, %ebp -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 36(%edx), %ebx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %esi, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 36(%ebp), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %edi, %eax +; FALLBACK24-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl 32(%edx), %esi -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movl %esi, %edi +; FALLBACK24-NEXT: movl 32(%ebp), %ebp +; FALLBACK24-NEXT: movl %ebp, %esi +; FALLBACK24-NEXT: shrl %esi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: orl %ebx, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %eax, %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: orl %ebx, %edi +; FALLBACK24-NEXT: orl %ebp, %edi ; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK24-NEXT: movl 
44(%ebp), %ebx ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: movl 40(%ebp), %esi -; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %ebx, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %esi, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK24-NEXT: shrl %edi +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %edi +; FALLBACK24-NEXT: orl %esi, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl 52(%ebp), %esi ; FALLBACK24-NEXT: movl %esi, %edi -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: negl %edx -; FALLBACK24-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: negl %eax +; FALLBACK24-NEXT: movl 176(%esp,%eax), %ebx ; FALLBACK24-NEXT: movl %ebx, %ebp ; FALLBACK24-NEXT: shrl %ebp -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %ebp ; FALLBACK24-NEXT: orl %edi, %ebp -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: shrl %edx -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shrl %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: shrl %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: orl %ebx, %eax +; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK24-NEXT: movl 60(%edi), %edx -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shll %cl, %edx +; FALLBACK24-NEXT: movl 60(%edi), %eax +; FALLBACK24-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK24-NEXT: shll %cl, %eax ; FALLBACK24-NEXT: movl 56(%edi), %ebx ; FALLBACK24-NEXT: movl %ebx, %edi ; FALLBACK24-NEXT: shrl %edi -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl 
%cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: orl %eax, %edi +; FALLBACK24-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: shll %cl, %ebx ; FALLBACK24-NEXT: shrl %esi -; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: movl %edx, %ecx ; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: orl %ebx, %esi ; FALLBACK24-NEXT: movl %eax, %ecx @@ -18840,7 +18865,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: movl 8(%eax), %esi ; FALLBACK25-NEXT: movl 12(%eax), %edx ; FALLBACK25-NEXT: shll $3, %ecx -; FALLBACK25-NEXT: andl $24, %ecx ; FALLBACK25-NEXT: movl %edx, %edi ; FALLBACK25-NEXT: shldl %cl, %esi, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -18944,7 +18968,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx ; FALLBACK26-NEXT: andl $60, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: leal {{[0-9]+}}(%esp), %edi @@ -18954,6 +18977,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl 4(%edi), %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: andb $24, %bl ; FALLBACK26-NEXT: notb %bl ; FALLBACK26-NEXT: shrl %ecx ; FALLBACK26-NEXT: shrxl %ebx, %ecx, %esi @@ -19107,7 +19131,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: vmovups %ymm1, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) ; FALLBACK27-NEXT: leal (,%ebx,8), %ecx -; FALLBACK27-NEXT: andl $24, %ecx ; FALLBACK27-NEXT: andl $60, %ebx ; FALLBACK27-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK27-NEXT: subl %ebx, %eax @@ -19210,164 +19233,176 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx ; FALLBACK28-NEXT: vmovups (%ecx), %zmm0 -; FALLBACK28-NEXT: movl (%eax), %eax +; FALLBACK28-NEXT: movl (%eax), %ecx ; FALLBACK28-NEXT: vxorps %xmm1, %xmm1, %xmm1 ; FALLBACK28-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: andl $60, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: subl %edx, %ecx -; FALLBACK28-NEXT: movl (%ecx), %edi +; FALLBACK28-NEXT: movl %ecx, %eax +; FALLBACK28-NEXT: andl $60, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: leal {{[0-9]+}}(%esp), %edx +; FALLBACK28-NEXT: subl %eax, %edx +; FALLBACK28-NEXT: movl (%edx), %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 4(%ecx), %edx -; FALLBACK28-NEXT: movl %ecx, %ebp -; FALLBACK28-NEXT: shll $3, %eax -; FALLBACK28-NEXT: andl $24, %eax -; FALLBACK28-NEXT: movl %edx, %esi -; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl 4(%edx), %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %edx, %ebx +; FALLBACK28-NEXT: shll $3, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: shrl 
%edi -; FALLBACK28-NEXT: movb %al, %ch -; FALLBACK28-NEXT: notb %ch -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %ecx, %edx +; FALLBACK28-NEXT: movl %ecx, %eax +; FALLBACK28-NEXT: andb $24, %dl +; FALLBACK28-NEXT: notb %dl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: orl %esi, %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 12(%ebp), %ebx +; FALLBACK28-NEXT: movl %ebx, %esi ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl 12(%ebx), %ebx +; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 8(%ebp), %esi -; FALLBACK28-NEXT: movl %ebp, %edi +; FALLBACK28-NEXT: movl 8(%esi), %esi ; FALLBACK28-NEXT: movl %esi, %ebp ; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: orl %ebx, %ebp ; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl %eax, %edi ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl 20(%edi), %ebx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %esi, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK28-NEXT: movl 20(%esi), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %edi, %eax +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 16(%edi), %esi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl 16(%esi), %esi +; FALLBACK28-NEXT: movl %esi, %ebp +; FALLBACK28-NEXT: shrl %ebp +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %ebp +; FALLBACK28-NEXT: orl %ebx, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %esi ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi ; FALLBACK28-NEXT: orl %esi, %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %ebp, %edx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK28-NEXT: movl 28(%ebp), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: movl %eax, %edi ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl 24(%ebp), %esi -; 
FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi -; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %ebx, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %edi, %ecx +; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: orl %esi, %ebp -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 36(%edx), %ebx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %esi, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 36(%ebp), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %edi, %eax +; FALLBACK28-NEXT: movl %edi, (%esp) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl 32(%edx), %esi -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movl %esi, %edi +; FALLBACK28-NEXT: movl 32(%ebp), %ebp +; FALLBACK28-NEXT: movl %ebp, %esi +; FALLBACK28-NEXT: shrl %esi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: orl %ebx, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %eax, %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %ebx, %edi +; FALLBACK28-NEXT: orl %ebp, %edi ; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; FALLBACK28-NEXT: movl 44(%ebp), %ebx ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: movl 40(%ebp), %esi -; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; 
FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %ebx, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %esi, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; FALLBACK28-NEXT: shrl %edi +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %edi +; FALLBACK28-NEXT: orl %esi, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl 52(%ebp), %esi ; FALLBACK28-NEXT: movl %esi, %edi -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: negl %edx -; FALLBACK28-NEXT: movl 176(%esp,%edx), %ebx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: negl %eax +; FALLBACK28-NEXT: movl 176(%esp,%eax), %ebx ; FALLBACK28-NEXT: movl %ebx, %ebp ; FALLBACK28-NEXT: shrl %ebp -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %ebp ; FALLBACK28-NEXT: orl %edi, %ebp -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx ; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: shrl %edx -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shrl %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: shrl %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: orl %ebx, %eax +; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; FALLBACK28-NEXT: movl 60(%edi), %edx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shll %cl, %edx +; FALLBACK28-NEXT: movl 60(%edi), %eax +; FALLBACK28-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK28-NEXT: shll %cl, %eax ; FALLBACK28-NEXT: movl 56(%edi), %ebx ; FALLBACK28-NEXT: movl %ebx, %edi ; FALLBACK28-NEXT: shrl %edi -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: orl %eax, %edi +; FALLBACK28-NEXT: movl (%esp), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: shll %cl, %ebx ; FALLBACK28-NEXT: shrl %esi -; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: movl %edx, %ecx ; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: orl %ebx, %esi ; FALLBACK28-NEXT: movl %eax, %ecx @@ -19431,7 +19466,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: movl 8(%eax), %esi ; FALLBACK29-NEXT: movl 12(%eax), %edx ; FALLBACK29-NEXT: shll $3, %ecx -; FALLBACK29-NEXT: andl $24, %ecx ; FALLBACK29-NEXT: movl %edx, %edi ; FALLBACK29-NEXT: shldl %cl, %esi, 
%edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -19532,7 +19566,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: leal (,%eax,8), %edx -; FALLBACK30-NEXT: andl $24, %edx ; FALLBACK30-NEXT: andl $60, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: leal {{[0-9]+}}(%esp), %edi @@ -19542,6 +19575,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl 4(%edi), %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: andb $24, %bl ; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: shrl %ecx ; FALLBACK30-NEXT: shrxl %ebx, %ecx, %esi @@ -19692,7 +19726,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: vmovups %zmm1, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: vmovups %zmm0, {{[0-9]+}}(%esp) ; FALLBACK31-NEXT: leal (,%ebx,8), %ecx -; FALLBACK31-NEXT: andl $24, %ecx ; FALLBACK31-NEXT: andl $60, %ebx ; FALLBACK31-NEXT: leal {{[0-9]+}}(%esp), %eax ; FALLBACK31-NEXT: subl %ebx, %eax @@ -20193,7 +20226,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; FALLBACK0-NEXT: leal (,%rdi,8), %eax -; FALLBACK0-NEXT: andl $56, %eax ; FALLBACK0-NEXT: andl $56, %edi ; FALLBACK0-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK0-NEXT: movq -120(%rsp,%rdi), %r8 @@ -20201,6 +20233,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK0-NEXT: movl %eax, %ecx ; FALLBACK0-NEXT: shrq %cl, %r11 ; FALLBACK0-NEXT: movl %eax, %esi +; FALLBACK0-NEXT: andb $56, %sil ; FALLBACK0-NEXT: notb %sil ; FALLBACK0-NEXT: movq -112(%rsp,%rdi), %rbx ; FALLBACK0-NEXT: leaq (%rbx,%rbx), %r9 @@ -20299,7 +20332,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK1-NEXT: leal (,%rax,8), %ecx -; FALLBACK1-NEXT: andl $56, %ecx ; FALLBACK1-NEXT: andl $56, %eax ; FALLBACK1-NEXT: movq -112(%rsp,%rax), %rdi ; FALLBACK1-NEXT: movq -128(%rsp,%rax), %rsi @@ -20370,54 +20402,54 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK2-NEXT: leal (,%rax,8), %ecx -; FALLBACK2-NEXT: andl $56, %ecx ; FALLBACK2-NEXT: andl $56, %eax ; FALLBACK2-NEXT: movq -120(%rsp,%rax), %rdi -; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r9 -; FALLBACK2-NEXT: shrxq %rcx, %rdi, %rbx +; FALLBACK2-NEXT: movq -112(%rsp,%rax), %r10 +; FALLBACK2-NEXT: shrxq %rcx, %rdi, %r9 ; FALLBACK2-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 ; FALLBACK2-NEXT: movq -104(%rsp,%rax), %rsi ; FALLBACK2-NEXT: shrxq %rcx, %rsi, %r8 -; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r10 -; FALLBACK2-NEXT: shrxq %rcx, %r9, %r11 +; FALLBACK2-NEXT: movq -96(%rsp,%rax), %r11 +; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbx ; FALLBACK2-NEXT: movq -88(%rsp,%rax), %r14 ; FALLBACK2-NEXT: shrxq %rcx, %r14, %r15 -; FALLBACK2-NEXT: shrxq %rcx, %r10, %rbp +; FALLBACK2-NEXT: shrxq %rcx, %r11, %rbp ; FALLBACK2-NEXT: movl %ecx, %r12d +; FALLBACK2-NEXT: andb $56, %r12b ; FALLBACK2-NEXT: notb %r12b -; FALLBACK2-NEXT: addq %r9, %r9 -; 
FALLBACK2-NEXT: shlxq %r12, %r9, %r9 -; FALLBACK2-NEXT: orq %rbx, %r9 +; FALLBACK2-NEXT: addq %r10, %r10 +; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 +; FALLBACK2-NEXT: orq %r9, %r10 ; FALLBACK2-NEXT: addq %rdi, %rdi ; FALLBACK2-NEXT: shlxq %r12, %rdi, %rdi ; FALLBACK2-NEXT: orq %r13, %rdi -; FALLBACK2-NEXT: movq -80(%rsp,%rax), %rbx -; FALLBACK2-NEXT: shrxq %rcx, %rbx, %r13 +; FALLBACK2-NEXT: movq -80(%rsp,%rax), %r9 +; FALLBACK2-NEXT: shrxq %rcx, %r9, %r13 ; FALLBACK2-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK2-NEXT: sarxq %rcx, %rax, %rcx -; FALLBACK2-NEXT: addq %r10, %r10 -; FALLBACK2-NEXT: shlxq %r12, %r10, %r10 -; FALLBACK2-NEXT: orq %r8, %r10 +; FALLBACK2-NEXT: addq %r11, %r11 +; FALLBACK2-NEXT: shlxq %r12, %r11, %r11 +; FALLBACK2-NEXT: orq %r8, %r11 ; FALLBACK2-NEXT: addq %rsi, %rsi ; FALLBACK2-NEXT: shlxq %r12, %rsi, %rsi -; FALLBACK2-NEXT: orq %r11, %rsi -; FALLBACK2-NEXT: leaq (%rbx,%rbx), %r8 +; FALLBACK2-NEXT: orq %rbx, %rsi +; FALLBACK2-NEXT: leaq (%r9,%r9), %r8 ; FALLBACK2-NEXT: shlxq %r12, %r8, %r8 ; FALLBACK2-NEXT: orq %r15, %r8 ; FALLBACK2-NEXT: addq %r14, %r14 -; FALLBACK2-NEXT: shlxq %r12, %r14, %r11 -; FALLBACK2-NEXT: orq %rbp, %r11 +; FALLBACK2-NEXT: shlxq %r12, %r14, %r9 +; FALLBACK2-NEXT: orq %rbp, %r9 ; FALLBACK2-NEXT: addq %rax, %rax ; FALLBACK2-NEXT: shlxq %r12, %rax, %rax ; FALLBACK2-NEXT: orq %r13, %rax ; FALLBACK2-NEXT: movq %rcx, 56(%rdx) ; FALLBACK2-NEXT: movq %rax, 48(%rdx) -; FALLBACK2-NEXT: movq %r11, 32(%rdx) +; FALLBACK2-NEXT: movq %r9, 32(%rdx) ; FALLBACK2-NEXT: movq %r8, 40(%rdx) ; FALLBACK2-NEXT: movq %rsi, 16(%rdx) -; FALLBACK2-NEXT: movq %r10, 24(%rdx) +; FALLBACK2-NEXT: movq %r11, 24(%rdx) ; FALLBACK2-NEXT: movq %rdi, (%rdx) -; FALLBACK2-NEXT: movq %r9, 8(%rdx) +; FALLBACK2-NEXT: movq %r10, 8(%rdx) ; FALLBACK2-NEXT: addq $8, %rsp ; FALLBACK2-NEXT: popq %rbx ; FALLBACK2-NEXT: popq %r12 @@ -20459,7 +20491,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK3-NEXT: leal (,%rax,8), %ecx -; FALLBACK3-NEXT: andl $56, %ecx ; FALLBACK3-NEXT: andl $56, %eax ; FALLBACK3-NEXT: movq -112(%rsp,%rax), %rdi ; FALLBACK3-NEXT: movq -128(%rsp,%rax), %rsi @@ -20478,17 +20509,16 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK3-NEXT: shrdq %cl, %r14, %r10 ; FALLBACK3-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK3-NEXT: shrdq %cl, %rax, %r11 -; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax -; FALLBACK3-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK3-NEXT: shrdq %cl, %r9, %rsi +; FALLBACK3-NEXT: sarxq %rcx, %rax, %rax ; FALLBACK3-NEXT: movq %r11, 48(%rdx) +; FALLBACK3-NEXT: movq %rax, 56(%rdx) ; FALLBACK3-NEXT: movq %r10, 32(%rdx) ; FALLBACK3-NEXT: movq %r15, 40(%rdx) ; FALLBACK3-NEXT: movq %rdi, 16(%rdx) ; FALLBACK3-NEXT: movq %rbx, 24(%rdx) ; FALLBACK3-NEXT: movq %rsi, (%rdx) ; FALLBACK3-NEXT: movq %r8, 8(%rdx) -; FALLBACK3-NEXT: movq %rax, 56(%rdx) ; FALLBACK3-NEXT: popq %rbx ; FALLBACK3-NEXT: popq %r14 ; FALLBACK3-NEXT: popq %r15 @@ -20524,13 +20554,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK4-NEXT: leal (,%rdi,8), %eax -; FALLBACK4-NEXT: andl $56, %eax ; FALLBACK4-NEXT: andl $56, %edi ; FALLBACK4-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK4-NEXT: movq -120(%rsp,%rdi), %r9 ; FALLBACK4-NEXT: movl %eax, %ecx ; FALLBACK4-NEXT: shrq %cl, %r10 ; 
FALLBACK4-NEXT: movl %eax, %esi +; FALLBACK4-NEXT: andb $56, %sil ; FALLBACK4-NEXT: notb %sil ; FALLBACK4-NEXT: leaq (%r9,%r9), %r8 ; FALLBACK4-NEXT: movl %esi, %ecx @@ -20626,7 +20656,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK5-NEXT: leal (,%rax,8), %ecx -; FALLBACK5-NEXT: andl $56, %ecx ; FALLBACK5-NEXT: andl $56, %eax ; FALLBACK5-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK5-NEXT: movq -104(%rsp,%rax), %r9 @@ -20692,9 +20721,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK6-NEXT: leal (,%rax,8), %esi -; FALLBACK6-NEXT: andl $56, %esi ; FALLBACK6-NEXT: andl $56, %eax -; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK6-NEXT: shrxq %rsi, -128(%rsp,%rax), %rbx ; FALLBACK6-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK6-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK6-NEXT: shrxq %rsi, %rdi, %r12 @@ -20703,34 +20731,35 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK6-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK6-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK6-NEXT: movl %esi, %ebx -; FALLBACK6-NEXT: notb %bl +; FALLBACK6-NEXT: movl %esi, %r11d +; FALLBACK6-NEXT: andb $56, %r11b +; FALLBACK6-NEXT: notb %r11b ; FALLBACK6-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK6-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK6-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK6-NEXT: orq %r11, %r8 -; FALLBACK6-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK6-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK6-NEXT: orq %r12, %r11 +; FALLBACK6-NEXT: shlxq %r11, %r8, %r8 +; FALLBACK6-NEXT: orq %rbx, %r8 +; FALLBACK6-NEXT: leaq (%r13,%r13), %rbx +; FALLBACK6-NEXT: shlxq %r11, %rbx, %rbx +; FALLBACK6-NEXT: orq %r12, %rbx ; FALLBACK6-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK6-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK6-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK6-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK6-NEXT: sarxq %rsi, %rax, %rsi ; FALLBACK6-NEXT: addq %rdi, %rdi -; FALLBACK6-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK6-NEXT: shlxq %r11, %rdi, %rdi ; FALLBACK6-NEXT: orq %r9, %rdi ; FALLBACK6-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK6-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK6-NEXT: shlxq %r11, %r9, %r9 ; FALLBACK6-NEXT: orq %r14, %r9 ; FALLBACK6-NEXT: addq %r10, %r10 -; FALLBACK6-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK6-NEXT: shlxq %r11, %r10, %r10 ; FALLBACK6-NEXT: orq %r15, %r10 ; FALLBACK6-NEXT: addq %rax, %rax -; FALLBACK6-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK6-NEXT: shlxq %r11, %rax, %rax ; FALLBACK6-NEXT: orq %r13, %rax ; FALLBACK6-NEXT: addq %rcx, %rcx -; FALLBACK6-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK6-NEXT: shlxq %r11, %rcx, %rcx ; FALLBACK6-NEXT: orq %rbp, %rcx ; FALLBACK6-NEXT: movq %rsi, 56(%rdx) ; FALLBACK6-NEXT: movq %rcx, 8(%rdx) @@ -20738,7 +20767,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK6-NEXT: movq %r10, 32(%rdx) ; FALLBACK6-NEXT: movq %r9, 40(%rdx) ; FALLBACK6-NEXT: movq %rdi, 16(%rdx) -; FALLBACK6-NEXT: movq %r11, 24(%rdx) +; FALLBACK6-NEXT: movq %rbx, 24(%rdx) ; FALLBACK6-NEXT: movq %r8, (%rdx) ; FALLBACK6-NEXT: addq $8, %rsp ; FALLBACK6-NEXT: popq %rbx @@ -20775,7 +20804,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; 
FALLBACK7-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK7-NEXT: leal (,%rax,8), %ecx -; FALLBACK7-NEXT: andl $56, %ecx ; FALLBACK7-NEXT: andl $56, %eax ; FALLBACK7-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK7-NEXT: movq -104(%rsp,%rax), %r9 @@ -20795,17 +20823,16 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK7-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK7-NEXT: movq %rax, %r15 ; FALLBACK7-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK7-NEXT: sarxq %rcx, %r11, %r10 -; FALLBACK7-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK7-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK7-NEXT: sarxq %rcx, %r11, %rax ; FALLBACK7-NEXT: movq %r15, 8(%rdx) ; FALLBACK7-NEXT: movq %r9, 48(%rdx) +; FALLBACK7-NEXT: movq %rax, 56(%rdx) ; FALLBACK7-NEXT: movq %rdi, 32(%rdx) ; FALLBACK7-NEXT: movq %rbx, 40(%rdx) ; FALLBACK7-NEXT: movq %r8, 16(%rdx) ; FALLBACK7-NEXT: movq %rsi, 24(%rdx) ; FALLBACK7-NEXT: movq %r14, (%rdx) -; FALLBACK7-NEXT: movq %r10, 56(%rdx) ; FALLBACK7-NEXT: popq %rbx ; FALLBACK7-NEXT: popq %r14 ; FALLBACK7-NEXT: popq %r15 @@ -20839,13 +20866,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK8-NEXT: leal (,%rdi,8), %eax -; FALLBACK8-NEXT: andl $56, %eax ; FALLBACK8-NEXT: andl $56, %edi ; FALLBACK8-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK8-NEXT: movq -120(%rsp,%rdi), %r9 ; FALLBACK8-NEXT: movl %eax, %ecx ; FALLBACK8-NEXT: shrq %cl, %r10 ; FALLBACK8-NEXT: movl %eax, %esi +; FALLBACK8-NEXT: andb $56, %sil ; FALLBACK8-NEXT: notb %sil ; FALLBACK8-NEXT: leaq (%r9,%r9), %r8 ; FALLBACK8-NEXT: movl %esi, %ecx @@ -20940,7 +20967,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK9-NEXT: leal (,%rax,8), %ecx -; FALLBACK9-NEXT: andl $56, %ecx ; FALLBACK9-NEXT: andl $56, %eax ; FALLBACK9-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK9-NEXT: movq -104(%rsp,%rax), %r9 @@ -21005,9 +21031,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK10-NEXT: leal (,%rax,8), %esi -; FALLBACK10-NEXT: andl $56, %esi ; FALLBACK10-NEXT: andl $56, %eax -; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK10-NEXT: shrxq %rsi, -128(%rsp,%rax), %rbx ; FALLBACK10-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK10-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK10-NEXT: shrxq %rsi, %rdi, %r12 @@ -21016,34 +21041,35 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK10-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK10-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK10-NEXT: movl %esi, %ebx -; FALLBACK10-NEXT: notb %bl +; FALLBACK10-NEXT: movl %esi, %r11d +; FALLBACK10-NEXT: andb $56, %r11b +; FALLBACK10-NEXT: notb %r11b ; FALLBACK10-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK10-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK10-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK10-NEXT: orq %r11, %r8 -; FALLBACK10-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK10-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK10-NEXT: orq %r12, %r11 +; FALLBACK10-NEXT: shlxq %r11, %r8, %r8 +; FALLBACK10-NEXT: orq %rbx, %r8 +; FALLBACK10-NEXT: leaq (%r13,%r13), %rbx +; FALLBACK10-NEXT: shlxq %r11, %rbx, %rbx +; FALLBACK10-NEXT: orq %r12, %rbx ; FALLBACK10-NEXT: movq 
-80(%rsp,%rax), %r12 ; FALLBACK10-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK10-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK10-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK10-NEXT: sarxq %rsi, %rax, %rsi ; FALLBACK10-NEXT: addq %rdi, %rdi -; FALLBACK10-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK10-NEXT: shlxq %r11, %rdi, %rdi ; FALLBACK10-NEXT: orq %r9, %rdi ; FALLBACK10-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK10-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK10-NEXT: shlxq %r11, %r9, %r9 ; FALLBACK10-NEXT: orq %r14, %r9 ; FALLBACK10-NEXT: addq %r10, %r10 -; FALLBACK10-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK10-NEXT: shlxq %r11, %r10, %r10 ; FALLBACK10-NEXT: orq %r15, %r10 ; FALLBACK10-NEXT: addq %rax, %rax -; FALLBACK10-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK10-NEXT: shlxq %r11, %rax, %rax ; FALLBACK10-NEXT: orq %r13, %rax ; FALLBACK10-NEXT: addq %rcx, %rcx -; FALLBACK10-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK10-NEXT: shlxq %r11, %rcx, %rcx ; FALLBACK10-NEXT: orq %rbp, %rcx ; FALLBACK10-NEXT: movq %rsi, 56(%rdx) ; FALLBACK10-NEXT: movq %rcx, 8(%rdx) @@ -21051,7 +21077,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK10-NEXT: movq %r10, 32(%rdx) ; FALLBACK10-NEXT: movq %r9, 40(%rdx) ; FALLBACK10-NEXT: movq %rdi, 16(%rdx) -; FALLBACK10-NEXT: movq %r11, 24(%rdx) +; FALLBACK10-NEXT: movq %rbx, 24(%rdx) ; FALLBACK10-NEXT: movq %r8, (%rdx) ; FALLBACK10-NEXT: addq $8, %rsp ; FALLBACK10-NEXT: popq %rbx @@ -21087,7 +21113,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK11-NEXT: leal (,%rax,8), %ecx -; FALLBACK11-NEXT: andl $56, %ecx ; FALLBACK11-NEXT: andl $56, %eax ; FALLBACK11-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK11-NEXT: movq -104(%rsp,%rax), %r9 @@ -21107,17 +21132,16 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK11-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK11-NEXT: movq %rax, %r15 ; FALLBACK11-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK11-NEXT: sarxq %rcx, %r11, %r10 -; FALLBACK11-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK11-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK11-NEXT: sarxq %rcx, %r11, %rax ; FALLBACK11-NEXT: movq %r15, 8(%rdx) ; FALLBACK11-NEXT: movq %r9, 48(%rdx) +; FALLBACK11-NEXT: movq %rax, 56(%rdx) ; FALLBACK11-NEXT: movq %rdi, 32(%rdx) ; FALLBACK11-NEXT: movq %rbx, 40(%rdx) ; FALLBACK11-NEXT: movq %r8, 16(%rdx) ; FALLBACK11-NEXT: movq %rsi, 24(%rdx) ; FALLBACK11-NEXT: movq %r14, (%rdx) -; FALLBACK11-NEXT: movq %r10, 56(%rdx) ; FALLBACK11-NEXT: popq %rbx ; FALLBACK11-NEXT: popq %r14 ; FALLBACK11-NEXT: popq %r15 @@ -21152,13 +21176,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; FALLBACK12-NEXT: leal (,%rdi,8), %eax -; FALLBACK12-NEXT: andl $56, %eax ; FALLBACK12-NEXT: andl $56, %edi ; FALLBACK12-NEXT: movq -128(%rsp,%rdi), %r10 ; FALLBACK12-NEXT: movq -120(%rsp,%rdi), %r9 ; FALLBACK12-NEXT: movl %eax, %ecx ; FALLBACK12-NEXT: shrq %cl, %r10 ; FALLBACK12-NEXT: movl %eax, %esi +; FALLBACK12-NEXT: andb $56, %sil ; FALLBACK12-NEXT: notb %sil ; FALLBACK12-NEXT: leaq (%r9,%r9), %r8 ; FALLBACK12-NEXT: movl %esi, %ecx @@ -21253,7 +21277,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK13-NEXT: leal 
(,%rax,8), %ecx -; FALLBACK13-NEXT: andl $56, %ecx ; FALLBACK13-NEXT: andl $56, %eax ; FALLBACK13-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK13-NEXT: movq -104(%rsp,%rax), %r9 @@ -21318,9 +21341,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK14-NEXT: leal (,%rax,8), %esi -; FALLBACK14-NEXT: andl $56, %esi ; FALLBACK14-NEXT: andl $56, %eax -; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %r11 +; FALLBACK14-NEXT: shrxq %rsi, -128(%rsp,%rax), %rbx ; FALLBACK14-NEXT: movq -112(%rsp,%rax), %rcx ; FALLBACK14-NEXT: movq -104(%rsp,%rax), %rdi ; FALLBACK14-NEXT: shrxq %rsi, %rdi, %r12 @@ -21329,34 +21351,35 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq -88(%rsp,%rax), %r10 ; FALLBACK14-NEXT: shrxq %rsi, %r10, %r14 ; FALLBACK14-NEXT: shrxq %rsi, %r13, %r15 -; FALLBACK14-NEXT: movl %esi, %ebx -; FALLBACK14-NEXT: notb %bl +; FALLBACK14-NEXT: movl %esi, %r11d +; FALLBACK14-NEXT: andb $56, %r11b +; FALLBACK14-NEXT: notb %r11b ; FALLBACK14-NEXT: movq -120(%rsp,%rax), %rbp ; FALLBACK14-NEXT: leaq (%rbp,%rbp), %r8 -; FALLBACK14-NEXT: shlxq %rbx, %r8, %r8 -; FALLBACK14-NEXT: orq %r11, %r8 -; FALLBACK14-NEXT: leaq (%r13,%r13), %r11 -; FALLBACK14-NEXT: shlxq %rbx, %r11, %r11 -; FALLBACK14-NEXT: orq %r12, %r11 +; FALLBACK14-NEXT: shlxq %r11, %r8, %r8 +; FALLBACK14-NEXT: orq %rbx, %r8 +; FALLBACK14-NEXT: leaq (%r13,%r13), %rbx +; FALLBACK14-NEXT: shlxq %r11, %rbx, %rbx +; FALLBACK14-NEXT: orq %r12, %rbx ; FALLBACK14-NEXT: movq -80(%rsp,%rax), %r12 ; FALLBACK14-NEXT: shrxq %rsi, %r12, %r13 ; FALLBACK14-NEXT: shrxq %rsi, %rbp, %rbp ; FALLBACK14-NEXT: movq -72(%rsp,%rax), %rax ; FALLBACK14-NEXT: sarxq %rsi, %rax, %rsi ; FALLBACK14-NEXT: addq %rdi, %rdi -; FALLBACK14-NEXT: shlxq %rbx, %rdi, %rdi +; FALLBACK14-NEXT: shlxq %r11, %rdi, %rdi ; FALLBACK14-NEXT: orq %r9, %rdi ; FALLBACK14-NEXT: leaq (%r12,%r12), %r9 -; FALLBACK14-NEXT: shlxq %rbx, %r9, %r9 +; FALLBACK14-NEXT: shlxq %r11, %r9, %r9 ; FALLBACK14-NEXT: orq %r14, %r9 ; FALLBACK14-NEXT: addq %r10, %r10 -; FALLBACK14-NEXT: shlxq %rbx, %r10, %r10 +; FALLBACK14-NEXT: shlxq %r11, %r10, %r10 ; FALLBACK14-NEXT: orq %r15, %r10 ; FALLBACK14-NEXT: addq %rax, %rax -; FALLBACK14-NEXT: shlxq %rbx, %rax, %rax +; FALLBACK14-NEXT: shlxq %r11, %rax, %rax ; FALLBACK14-NEXT: orq %r13, %rax ; FALLBACK14-NEXT: addq %rcx, %rcx -; FALLBACK14-NEXT: shlxq %rbx, %rcx, %rcx +; FALLBACK14-NEXT: shlxq %r11, %rcx, %rcx ; FALLBACK14-NEXT: orq %rbp, %rcx ; FALLBACK14-NEXT: movq %rsi, 56(%rdx) ; FALLBACK14-NEXT: movq %rcx, 8(%rdx) @@ -21364,7 +21387,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK14-NEXT: movq %r10, 32(%rdx) ; FALLBACK14-NEXT: movq %r9, 40(%rdx) ; FALLBACK14-NEXT: movq %rdi, 16(%rdx) -; FALLBACK14-NEXT: movq %r11, 24(%rdx) +; FALLBACK14-NEXT: movq %rbx, 24(%rdx) ; FALLBACK14-NEXT: movq %r8, (%rdx) ; FALLBACK14-NEXT: addq $8, %rsp ; FALLBACK14-NEXT: popq %rbx @@ -21400,7 +21423,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; FALLBACK15-NEXT: leal (,%rax,8), %ecx -; FALLBACK15-NEXT: andl $56, %ecx ; FALLBACK15-NEXT: andl $56, %eax ; FALLBACK15-NEXT: movq -96(%rsp,%rax), %rdi ; FALLBACK15-NEXT: movq -104(%rsp,%rax), %r9 @@ -21420,17 +21442,16 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, 
ptr %dst) nounwind { ; FALLBACK15-NEXT: movq -120(%rsp,%rax), %rax ; FALLBACK15-NEXT: movq %rax, %r15 ; FALLBACK15-NEXT: shrdq %cl, %r10, %r15 -; FALLBACK15-NEXT: sarxq %rcx, %r11, %r10 -; FALLBACK15-NEXT: # kill: def $cl killed $cl killed $rcx ; FALLBACK15-NEXT: shrdq %cl, %rax, %r14 +; FALLBACK15-NEXT: sarxq %rcx, %r11, %rax ; FALLBACK15-NEXT: movq %r15, 8(%rdx) ; FALLBACK15-NEXT: movq %r9, 48(%rdx) +; FALLBACK15-NEXT: movq %rax, 56(%rdx) ; FALLBACK15-NEXT: movq %rdi, 32(%rdx) ; FALLBACK15-NEXT: movq %rbx, 40(%rdx) ; FALLBACK15-NEXT: movq %r8, 16(%rdx) ; FALLBACK15-NEXT: movq %rsi, 24(%rdx) ; FALLBACK15-NEXT: movq %r14, (%rdx) -; FALLBACK15-NEXT: movq %r10, 56(%rdx) ; FALLBACK15-NEXT: popq %rbx ; FALLBACK15-NEXT: popq %r14 ; FALLBACK15-NEXT: popq %r15 @@ -21516,46 +21537,46 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK16-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK16-NEXT: movl %ebp, %ecx -; FALLBACK16-NEXT: movl %ebp, %esi -; FALLBACK16-NEXT: andl $60, %esi -; FALLBACK16-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK16-NEXT: shll $3, %ecx -; FALLBACK16-NEXT: andl $24, %ecx -; FALLBACK16-NEXT: movl %edx, %eax +; FALLBACK16-NEXT: movl %ebp, %edx +; FALLBACK16-NEXT: movl %ebp, %eax +; FALLBACK16-NEXT: andl $60, %eax +; FALLBACK16-NEXT: movl 68(%esp,%eax), %edi +; FALLBACK16-NEXT: movl %eax, %ebx +; FALLBACK16-NEXT: shll $3, %edx +; FALLBACK16-NEXT: movl %edi, %eax +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 72(%esp,%esi), %edi -; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: addl %edi, %edi -; FALLBACK16-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK16-NEXT: movl %ecx, %ebx -; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK16-NEXT: movl 72(%esp,%ebx), %ecx +; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp +; FALLBACK16-NEXT: movb %dl, %ch +; FALLBACK16-NEXT: andb $24, %ch ; FALLBACK16-NEXT: notb %ch ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %eax, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK16-NEXT: movb %dl, %cl +; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: addl %edi, %edi +; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: orl %eax, %edi ; FALLBACK16-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 64(%esp,%esi), %eax -; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movl 76(%esp,%ebx), %ebp +; FALLBACK16-NEXT: movl %ebp, %eax +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: addl %edx, %edx +; FALLBACK16-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK16-NEXT: leal (%edi,%edi), %esi ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %eax, %edx +; FALLBACK16-NEXT: shll %cl, %esi +; FALLBACK16-NEXT: orl %eax, %esi +; FALLBACK16-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 76(%esp,%esi), %ebp -; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movb %bl, 
%cl -; FALLBACK16-NEXT: shrl %cl, %edx -; FALLBACK16-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK16-NEXT: leal (%edi,%edi), %eax -; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %eax -; FALLBACK16-NEXT: orl %edx, %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; FALLBACK16-NEXT: shrl %cl, %eax ; FALLBACK16-NEXT: addl %ebp, %ebp @@ -21563,30 +21584,28 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %esi, %edx -; FALLBACK16-NEXT: movl 84(%esp,%esi), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 84(%esp,%ebx), %ebp +; FALLBACK16-NEXT: movl %ebp, %eax +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 88(%esp,%esi), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %ebp +; FALLBACK16-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK16-NEXT: leal (%esi,%esi), %edx ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: shll %cl, %edx +; FALLBACK16-NEXT: orl %eax, %edx +; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %edx, %eax -; FALLBACK16-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl 92(%esp,%edx), %ebp +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK16-NEXT: movl 92(%esp,%eax), %ebp ; FALLBACK16-NEXT: movl %ebp, %edx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; FALLBACK16-NEXT: movb %bl, %cl ; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: movl 96(%esp,%eax), %edi @@ -21596,87 +21615,85 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK16-NEXT: orl %edx, %eax ; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movl %ebx, %eax ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %esi, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 100(%esp,%edx), %eax -; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: shrl %cl, %eax -; 
FALLBACK16-NEXT: movl 104(%esp,%edx), %esi -; FALLBACK16-NEXT: leal (%esi,%esi), %ebp +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: movl 100(%esp,%ebx), %ebp +; FALLBACK16-NEXT: movl %ebp, %edx +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl 104(%esp,%ebx), %esi +; FALLBACK16-NEXT: leal (%esi,%esi), %eax ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebp -; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebx, %edx +; FALLBACK16-NEXT: shll %cl, %eax +; FALLBACK16-NEXT: orl %edx, %eax +; FALLBACK16-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %edi -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK16-NEXT: addl %ebx, %ebx +; FALLBACK16-NEXT: addl %ebp, %ebp ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %ebx -; FALLBACK16-NEXT: orl %edi, %ebx -; FALLBACK16-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; FALLBACK16-NEXT: movl 108(%esp,%ebp), %edi +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %edi, %ebp +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl 108(%esp,%ebx), %edi ; FALLBACK16-NEXT: movl %edi, %eax -; FALLBACK16-NEXT: movl %edx, %ebx -; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: movl %edx, %ecx ; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 112(%esp,%ebp), %ecx +; FALLBACK16-NEXT: movl 112(%esp,%ebx), %ecx ; FALLBACK16-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movl %ebp, %edx ; FALLBACK16-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp ; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK16-NEXT: movb %bl, %cl +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %esi ; FALLBACK16-NEXT: addl %edi, %edi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %edi ; FALLBACK16-NEXT: orl %esi, %edi -; FALLBACK16-NEXT: movl 116(%esp,%edx), %esi +; FALLBACK16-NEXT: movl 116(%esp,%ebx), %esi ; FALLBACK16-NEXT: movl %esi, %eax -; FALLBACK16-NEXT: movl %ebx, %ecx +; FALLBACK16-NEXT: movb %dl, %cl ; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl 120(%esp,%edx), %edx -; FALLBACK16-NEXT: leal (%edx,%edx), %ebp -; FALLBACK16-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; FALLBACK16-NEXT: movl 120(%esp,%ebx), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %ebp ; FALLBACK16-NEXT: orl %eax, %ebp -; FALLBACK16-NEXT: movb %bl, %cl -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK16-NEXT: shrl %cl, %eax +; FALLBACK16-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK16-NEXT: movl %edx, %eax +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK16-NEXT: shrl %cl, %edx ; FALLBACK16-NEXT: addl %esi, %esi ; FALLBACK16-NEXT: movb %ch, %cl ; FALLBACK16-NEXT: shll %cl, %esi -; FALLBACK16-NEXT: orl %eax, %esi -; FALLBACK16-NEXT: movb %bl, %cl -; 
FALLBACK16-NEXT: movl %edx, %eax -; FALLBACK16-NEXT: shrl %cl, %eax -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK16-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK16-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK16-NEXT: orl %edx, %esi +; FALLBACK16-NEXT: movb %al, %cl +; FALLBACK16-NEXT: movl %ebx, %edx +; FALLBACK16-NEXT: shrl %cl, %edx +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK16-NEXT: movl 124(%esp,%ebx), %ebx +; FALLBACK16-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK16-NEXT: movb %ch, %cl -; FALLBACK16-NEXT: shll %cl, %edx -; FALLBACK16-NEXT: orl %eax, %edx -; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK16-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK16-NEXT: shll %cl, %ebp +; FALLBACK16-NEXT: orl %edx, %ebp +; FALLBACK16-NEXT: movl %eax, %ecx ; FALLBACK16-NEXT: sarl %cl, %ebx ; FALLBACK16-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK16-NEXT: movl %ebx, 60(%eax) -; FALLBACK16-NEXT: movl %edx, 56(%eax) +; FALLBACK16-NEXT: movl %ebp, 56(%eax) ; FALLBACK16-NEXT: movl %esi, 48(%eax) -; FALLBACK16-NEXT: movl %ebp, 52(%eax) +; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK16-NEXT: movl %ecx, 52(%eax) ; FALLBACK16-NEXT: movl %edi, 40(%eax) ; FALLBACK16-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK16-NEXT: movl %ecx, 44(%eax) @@ -21786,91 +21803,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK17-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK17-NEXT: movl %ecx, %ebp -; FALLBACK17-NEXT: andl $60, %ebp -; FALLBACK17-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shll $3, %ecx -; FALLBACK17-NEXT: andl $24, %ecx -; FALLBACK17-NEXT: shrdl %cl, %edx, %eax -; FALLBACK17-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %esi -; FALLBACK17-NEXT: shrdl %cl, %edi, %esi -; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx +; FALLBACK17-NEXT: movl %ecx, %eax +; FALLBACK17-NEXT: andl $60, %eax +; FALLBACK17-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK17-NEXT: movl 52(%esp,%eax), %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx +; FALLBACK17-NEXT: shll $3, %ecx ; FALLBACK17-NEXT: shrdl %cl, %esi, %edx ; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi +; FALLBACK17-NEXT: movl 64(%esp,%eax), %ebx +; FALLBACK17-NEXT: movl 60(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %edi +; FALLBACK17-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK17-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %edi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi +; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 
72(%esp,%eax), %edi +; FALLBACK17-NEXT: movl 68(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %esi +; FALLBACK17-NEXT: shrdl %cl, %edi, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 80(%esp,%eax), %ebx +; FALLBACK17-NEXT: movl 76(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %esi +; FALLBACK17-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edx -; FALLBACK17-NEXT: shrdl %cl, %esi, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl %esi, %edx -; FALLBACK17-NEXT: shrdl %cl, %eax, %edi -; FALLBACK17-NEXT: movl %edi, (%esp) # 4-byte Spill -; FALLBACK17-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK17-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi -; FALLBACK17-NEXT: shrdl %cl, %esi, %edi -; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK17-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK17-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK17-NEXT: movl %eax, %edi ; FALLBACK17-NEXT: shrdl %cl, %edx, %edi -; FALLBACK17-NEXT: shrdl %cl, %eax, %esi -; FALLBACK17-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK17-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK17-NEXT: shrdl %cl, %eax, %edx -; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK17-NEXT: movl %edx, 56(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK17-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 88(%esp,%eax), %edi +; FALLBACK17-NEXT: movl 84(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %esi +; FALLBACK17-NEXT: shrdl %cl, %edi, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl %edi, %esi ; FALLBACK17-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK17-NEXT: movl %ebx, (%esp) # 4-byte Spill +; FALLBACK17-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK17-NEXT: movl 92(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %ebx +; FALLBACK17-NEXT: shrdl %cl, %edi, %ebx +; FALLBACK17-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK17-NEXT: movl 104(%esp,%eax), %esi +; FALLBACK17-NEXT: movl 100(%esp,%eax), %edx +; FALLBACK17-NEXT: movl %edx, %ebx +; FALLBACK17-NEXT: shrdl %cl, %esi, %ebx +; FALLBACK17-NEXT: shrdl %cl, %edx, %edi +; FALLBACK17-NEXT: movl 48(%esp,%eax), %ebp +; FALLBACK17-NEXT: movl 108(%esp,%eax), %edx +; FALLBACK17-NEXT: shrdl %cl, %edx, %esi +; FALLBACK17-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK17-NEXT: movl %esi, 56(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK17-NEXT: shrdl %cl, %esi, %ebp ; FALLBACK17-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK17-NEXT: sarl %cl, %eax -; FALLBACK17-NEXT: movl %eax, 60(%ebp) -; FALLBACK17-NEXT: movl %esi, 48(%ebp) -; FALLBACK17-NEXT: movl %edi, 52(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 40(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl 
%eax, 44(%ebp) -; FALLBACK17-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 32(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 36(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 24(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 28(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 16(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 20(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 8(%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 12(%ebp) -; FALLBACK17-NEXT: movl %ebx, (%ebp) -; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK17-NEXT: movl %eax, 4(%ebp) +; FALLBACK17-NEXT: sarl %cl, %edx +; FALLBACK17-NEXT: movl %edx, 60(%eax) +; FALLBACK17-NEXT: movl %edi, 48(%eax) +; FALLBACK17-NEXT: movl %ebx, 52(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 40(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 44(%eax) +; FALLBACK17-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 32(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 36(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 24(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 28(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 16(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 20(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 8(%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 12(%eax) +; FALLBACK17-NEXT: movl %ebp, (%eax) +; FALLBACK17-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK17-NEXT: movl %ecx, 4(%eax) ; FALLBACK17-NEXT: addl $188, %esp ; FALLBACK17-NEXT: popl %esi ; FALLBACK17-NEXT: popl %edi @@ -21959,7 +21975,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK18-NEXT: movl %eax, %ecx ; FALLBACK18-NEXT: leal (,%eax,8), %edx -; FALLBACK18-NEXT: andl $24, %edx ; FALLBACK18-NEXT: andl $60, %ecx ; FALLBACK18-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK18-NEXT: movl 72(%esp,%ecx), %edi @@ -21967,6 +21982,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK18-NEXT: shrxl %edx, %esi, %eax ; FALLBACK18-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK18-NEXT: movl %edx, %ebx +; FALLBACK18-NEXT: andb $24, %bl ; FALLBACK18-NEXT: notb %bl ; FALLBACK18-NEXT: leal (%edi,%edi), %ebp ; FALLBACK18-NEXT: shlxl %ebx, %ebp, %eax @@ -22179,7 +22195,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK19-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: 
shll $3, %ecx -; FALLBACK19-NEXT: andl $24, %ecx ; FALLBACK19-NEXT: shrdl %cl, %edx, %eax ; FALLBACK19-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK19-NEXT: movl 64(%esp,%ebp), %edi @@ -22304,159 +22319,162 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK20-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK20-NEXT: movl %eax, %esi -; FALLBACK20-NEXT: andl $60, %esi -; FALLBACK20-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK20-NEXT: shll $3, %eax -; FALLBACK20-NEXT: andl $24, %eax -; FALLBACK20-NEXT: movl %edx, %edi -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK20-NEXT: movl %eax, %edx +; FALLBACK20-NEXT: andl $60, %eax +; FALLBACK20-NEXT: movl 68(%esp,%eax), %edi +; FALLBACK20-NEXT: movl %eax, %ebx +; FALLBACK20-NEXT: shll $3, %edx +; FALLBACK20-NEXT: movl %edi, %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl 72(%esp,%ebx), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK20-NEXT: movb %al, %ch +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebp +; FALLBACK20-NEXT: movb %dl, %ch +; FALLBACK20-NEXT: andb $24, %ch ; FALLBACK20-NEXT: notb %ch ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %eax, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: addl %edi, %edi ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %edi, %edx +; FALLBACK20-NEXT: shll %cl, %edi +; FALLBACK20-NEXT: orl %eax, %edi +; FALLBACK20-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 76(%esp,%ebx), %ebp +; FALLBACK20-NEXT: movl %ebp, %eax +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK20-NEXT: leal (%edi,%edi), %esi +; FALLBACK20-NEXT: movb %ch, %cl +; FALLBACK20-NEXT: shll %cl, %esi +; FALLBACK20-NEXT: orl %eax, %esi +; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK20-NEXT: movl %edx, %ebp -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK20-NEXT: leal (%edi,%edi), %ebx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: addl %ebp, %ebp ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %eax, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill 
-; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: addl %edx, %edx +; FALLBACK20-NEXT: movl 84(%esp,%ebx), %ebp +; FALLBACK20-NEXT: movl %ebp, %eax +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK20-NEXT: leal (%esi,%esi), %edx ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %ebx, %edx +; FALLBACK20-NEXT: orl %eax, %edx ; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movl %eax, %edx -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: addl %eax, %eax -; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax -; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: movb %bl, %cl ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: addl %ebp, %ebp ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK20-NEXT: movl 92(%esp,%eax), %ebp +; FALLBACK20-NEXT: movl %ebp, %edx +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: movl 96(%esp,%eax), %edi ; FALLBACK20-NEXT: leal (%edi,%edi), %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: orl %edx, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: movb %bl, %cl +; FALLBACK20-NEXT: movl %ebx, %eax +; FALLBACK20-NEXT: shrl %cl, %esi +; FALLBACK20-NEXT: addl %ebp, %ebp ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %eax, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK20-NEXT: movl %ebx, %ebp -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK20-NEXT: leal (%edx,%edx), %eax +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %esi, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: movl 100(%esp,%ebx), %ebp +; 
FALLBACK20-NEXT: movl %ebp, %edx +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: movl 104(%esp,%ebx), %esi +; FALLBACK20-NEXT: leal (%esi,%esi), %eax ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %eax -; FALLBACK20-NEXT: orl %ebp, %eax +; FALLBACK20-NEXT: orl %edx, %eax ; FALLBACK20-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK20-NEXT: movb %dl, %cl ; FALLBACK20-NEXT: shrl %cl, %edi -; FALLBACK20-NEXT: addl %ebx, %ebx +; FALLBACK20-NEXT: addl %ebp, %ebp ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %edi, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK20-NEXT: movl %edi, %ebp -; FALLBACK20-NEXT: movl %eax, %ecx -; FALLBACK20-NEXT: shrl %cl, %ebp -; FALLBACK20-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %edi, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl 108(%esp,%ebx), %edi +; FALLBACK20-NEXT: movl %edi, %eax +; FALLBACK20-NEXT: movl %edx, %ecx +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl 112(%esp,%ebx), %ecx ; FALLBACK20-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK20-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK20-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %ebx -; FALLBACK20-NEXT: orl %ebp, %ebx -; FALLBACK20-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %eax, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %esi ; FALLBACK20-NEXT: addl %edi, %edi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %edi -; FALLBACK20-NEXT: orl %edx, %edi -; FALLBACK20-NEXT: movl %esi, %edx -; FALLBACK20-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK20-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK20-NEXT: movl %esi, %ebx -; FALLBACK20-NEXT: movb %al, %cl -; FALLBACK20-NEXT: shrl %cl, %ebx -; FALLBACK20-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK20-NEXT: leal (%eax,%eax), %ebp +; FALLBACK20-NEXT: orl %esi, %edi +; FALLBACK20-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK20-NEXT: movl %esi, %eax +; FALLBACK20-NEXT: movb %dl, %cl +; FALLBACK20-NEXT: shrl %cl, %eax +; FALLBACK20-NEXT: movl 120(%esp,%ebx), %ebx +; FALLBACK20-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %ebp -; FALLBACK20-NEXT: orl %ebx, %ebp +; FALLBACK20-NEXT: orl %eax, %ebp +; FALLBACK20-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK20-NEXT: movl %edx, %eax +; FALLBACK20-NEXT: movb %al, %cl ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movb %dl, %cl -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK20-NEXT: shrl %cl, %ebx +; FALLBACK20-NEXT: shrl %cl, %edx ; FALLBACK20-NEXT: addl %esi, %esi ; FALLBACK20-NEXT: movb %ch, %cl ; FALLBACK20-NEXT: shll %cl, %esi -; FALLBACK20-NEXT: orl %ebx, %esi -; FALLBACK20-NEXT: movb 
%dl, %cl -; FALLBACK20-NEXT: shrl %cl, %eax -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK20-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK20-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK20-NEXT: orl %edx, %esi +; FALLBACK20-NEXT: movb %al, %cl +; FALLBACK20-NEXT: movl %ebx, %edx +; FALLBACK20-NEXT: shrl %cl, %edx +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK20-NEXT: movl 124(%esp,%ebx), %ebx +; FALLBACK20-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK20-NEXT: movb %ch, %cl -; FALLBACK20-NEXT: shll %cl, %edx -; FALLBACK20-NEXT: orl %eax, %edx -; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK20-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK20-NEXT: shll %cl, %ebp +; FALLBACK20-NEXT: orl %edx, %ebp +; FALLBACK20-NEXT: movl %eax, %ecx ; FALLBACK20-NEXT: sarl %cl, %ebx ; FALLBACK20-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK20-NEXT: movl %ebx, 60(%eax) -; FALLBACK20-NEXT: movl %edx, 56(%eax) +; FALLBACK20-NEXT: movl %ebp, 56(%eax) ; FALLBACK20-NEXT: movl %esi, 48(%eax) -; FALLBACK20-NEXT: movl %ebp, 52(%eax) +; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK20-NEXT: movl %ecx, 52(%eax) ; FALLBACK20-NEXT: movl %edi, 40(%eax) ; FALLBACK20-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK20-NEXT: movl %ecx, 44(%eax) @@ -22528,91 +22546,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK21-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK21-NEXT: movl %ecx, %ebp -; FALLBACK21-NEXT: andl $60, %ebp -; FALLBACK21-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shll $3, %ecx -; FALLBACK21-NEXT: andl $24, %ecx -; FALLBACK21-NEXT: shrdl %cl, %edx, %eax -; FALLBACK21-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %esi -; FALLBACK21-NEXT: shrdl %cl, %edi, %esi -; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx +; FALLBACK21-NEXT: movl %ecx, %eax +; FALLBACK21-NEXT: andl $60, %eax +; FALLBACK21-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK21-NEXT: movl 52(%esp,%eax), %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx +; FALLBACK21-NEXT: shll $3, %ecx ; FALLBACK21-NEXT: shrdl %cl, %esi, %edx ; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi +; FALLBACK21-NEXT: movl 64(%esp,%eax), %ebx +; FALLBACK21-NEXT: movl 60(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %edi +; FALLBACK21-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK21-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %edi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi +; FALLBACK21-NEXT: shrdl %cl, %edx, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 72(%esp,%eax), %edi +; 
FALLBACK21-NEXT: movl 68(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %esi +; FALLBACK21-NEXT: shrdl %cl, %edi, %esi ; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edx -; FALLBACK21-NEXT: shrdl %cl, %esi, %edx -; FALLBACK21-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl %esi, %edx -; FALLBACK21-NEXT: shrdl %cl, %eax, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK21-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %esi, %edi -; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK21-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK21-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK21-NEXT: movl %eax, %edi -; FALLBACK21-NEXT: shrdl %cl, %edx, %edi -; FALLBACK21-NEXT: shrdl %cl, %eax, %esi -; FALLBACK21-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK21-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK21-NEXT: shrdl %cl, %eax, %edx -; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK21-NEXT: movl %edx, 56(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx -; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK21-NEXT: sarl %cl, %eax -; FALLBACK21-NEXT: movl %eax, 60(%ebp) -; FALLBACK21-NEXT: movl %esi, 48(%ebp) -; FALLBACK21-NEXT: movl %edi, 52(%ebp) -; FALLBACK21-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 40(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 44(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 32(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 36(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 24(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 28(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 16(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 20(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 8(%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 12(%ebp) -; FALLBACK21-NEXT: movl %ebx, (%ebp) -; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK21-NEXT: movl %eax, 4(%ebp) +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 80(%esp,%eax), %ebx +; FALLBACK21-NEXT: movl 76(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %esi +; FALLBACK21-NEXT: shrdl %cl, %ebx, %esi +; FALLBACK21-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 88(%esp,%eax), %edi +; FALLBACK21-NEXT: movl 84(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %esi +; FALLBACK21-NEXT: shrdl %cl, %edi, %esi +; FALLBACK21-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl %edi, %esi +; FALLBACK21-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK21-NEXT: movl 92(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %ebx +; FALLBACK21-NEXT: shrdl %cl, %edi, %ebx +; FALLBACK21-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK21-NEXT: shrdl %cl, %edx, %esi +; FALLBACK21-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK21-NEXT: movl 104(%esp,%eax), %esi +; FALLBACK21-NEXT: movl 100(%esp,%eax), %edx +; FALLBACK21-NEXT: movl %edx, %ebx +; FALLBACK21-NEXT: shrdl %cl, %esi, %ebx +; FALLBACK21-NEXT: shrdl %cl, %edx, %edi +; FALLBACK21-NEXT: movl 48(%esp,%eax), %ebp +; FALLBACK21-NEXT: movl 108(%esp,%eax), %edx +; FALLBACK21-NEXT: shrdl %cl, %edx, %esi +; FALLBACK21-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK21-NEXT: movl %esi, 56(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK21-NEXT: shrdl %cl, %esi, %ebp +; FALLBACK21-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK21-NEXT: sarl %cl, %edx +; FALLBACK21-NEXT: movl %edx, 60(%eax) +; FALLBACK21-NEXT: movl %edi, 48(%eax) +; FALLBACK21-NEXT: movl %ebx, 52(%eax) +; FALLBACK21-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 40(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 44(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 32(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 36(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 24(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 28(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 16(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 20(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 8(%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 12(%eax) +; FALLBACK21-NEXT: movl %ebp, (%eax) +; FALLBACK21-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK21-NEXT: movl %ecx, 4(%eax) ; FALLBACK21-NEXT: addl $188, %esp ; FALLBACK21-NEXT: popl %esi ; FALLBACK21-NEXT: popl %edi @@ -22663,7 +22680,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK22-NEXT: movl %eax, %ecx ; FALLBACK22-NEXT: leal (,%eax,8), %edx -; FALLBACK22-NEXT: andl $24, %edx ; FALLBACK22-NEXT: andl $60, %ecx ; FALLBACK22-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK22-NEXT: movl 72(%esp,%ecx), %edi @@ -22671,6 +22687,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK22-NEXT: shrxl %edx, %esi, %eax ; FALLBACK22-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK22-NEXT: movl %edx, %ebx +; FALLBACK22-NEXT: andb $24, %bl ; FALLBACK22-NEXT: notb %bl ; FALLBACK22-NEXT: leal (%edi,%edi), %ebp ; FALLBACK22-NEXT: shlxl %ebx, %ebp, %eax @@ -22845,7 +22862,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK23-NEXT: movl 52(%esp,%ebp), %eax ; 
FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: shll $3, %ecx -; FALLBACK23-NEXT: andl $24, %ecx ; FALLBACK23-NEXT: shrdl %cl, %edx, %eax ; FALLBACK23-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK23-NEXT: movl 64(%esp,%ebp), %edi @@ -22936,191 +22952,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK24-NEXT: pushl %edi ; FALLBACK24-NEXT: pushl %esi ; FALLBACK24-NEXT: subl $204, %esp -; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK24-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK24-NEXT: vmovups 32(%ecx), %xmm1 -; FALLBACK24-NEXT: movl 48(%ecx), %edx -; FALLBACK24-NEXT: movl 52(%ecx), %esi -; FALLBACK24-NEXT: movl 56(%ecx), %edi -; FALLBACK24-NEXT: movl 60(%ecx), %ecx -; FALLBACK24-NEXT: movl (%eax), %eax -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK24-NEXT: vmovups (%eax), %ymm0 +; FALLBACK24-NEXT: vmovups 32(%eax), %xmm1 +; FALLBACK24-NEXT: movl 48(%eax), %edx +; FALLBACK24-NEXT: movl 52(%eax), %esi +; FALLBACK24-NEXT: movl 56(%eax), %edi +; FALLBACK24-NEXT: movl 60(%eax), %eax +; FALLBACK24-NEXT: movl (%ecx), %ecx +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK24-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: sarl $31, %ecx -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK24-NEXT: movl %eax, %esi -; FALLBACK24-NEXT: andl $60, %esi -; FALLBACK24-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK24-NEXT: shll $3, %eax -; FALLBACK24-NEXT: andl $24, %eax -; FALLBACK24-NEXT: movl %edx, %edi -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK24-NEXT: sarl $31, %eax +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK24-NEXT: movl %ecx, %edx +; FALLBACK24-NEXT: movl %ecx, 
%eax +; FALLBACK24-NEXT: andl $60, %eax +; FALLBACK24-NEXT: movl 68(%esp,%eax), %edi +; FALLBACK24-NEXT: movl %eax, %ebx +; FALLBACK24-NEXT: shll $3, %edx +; FALLBACK24-NEXT: movl %edi, %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl 72(%esp,%ebx), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK24-NEXT: movb %al, %ch +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebp +; FALLBACK24-NEXT: movb %dl, %ch +; FALLBACK24-NEXT: andb $24, %ch ; FALLBACK24-NEXT: notb %ch ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %eax, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: addl %edi, %edi ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %edi, %edx +; FALLBACK24-NEXT: shll %cl, %edi +; FALLBACK24-NEXT: orl %eax, %edi +; FALLBACK24-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 76(%esp,%ebx), %ebp +; FALLBACK24-NEXT: movl %ebp, %eax +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK24-NEXT: leal (%edi,%edi), %esi +; FALLBACK24-NEXT: movb %ch, %cl +; FALLBACK24-NEXT: shll %cl, %esi +; FALLBACK24-NEXT: orl %eax, %esi +; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK24-NEXT: movl %edx, %ebp -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK24-NEXT: leal (%edi,%edi), %ebx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: addl %ebp, %ebp ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %eax, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: addl %edx, %edx +; FALLBACK24-NEXT: movl 84(%esp,%ebx), %ebp +; FALLBACK24-NEXT: movl %ebp, %eax +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK24-NEXT: leal (%esi,%esi), %edx ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %ebx, %edx +; FALLBACK24-NEXT: orl %eax, %edx ; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movl %eax, %edx -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; 
FALLBACK24-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: addl %eax, %eax -; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax -; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: movb %bl, %cl ; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: addl %ebp, %ebp ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK24-NEXT: movl 92(%esp,%eax), %ebp +; FALLBACK24-NEXT: movl %ebp, %edx +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: movl 96(%esp,%eax), %edi ; FALLBACK24-NEXT: leal (%edi,%edi), %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: orl %edx, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: movb %bl, %cl +; FALLBACK24-NEXT: movl %ebx, %eax +; FALLBACK24-NEXT: shrl %cl, %esi +; FALLBACK24-NEXT: addl %ebp, %ebp ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %eax, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK24-NEXT: movl %ebx, %ebp -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK24-NEXT: leal (%edx,%edx), %eax +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %esi, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: movl 100(%esp,%ebx), %ebp +; FALLBACK24-NEXT: movl %ebp, %edx +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: movl 104(%esp,%ebx), %esi +; FALLBACK24-NEXT: leal (%esi,%esi), %eax ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %eax -; FALLBACK24-NEXT: orl %ebp, %eax +; FALLBACK24-NEXT: orl %edx, %eax ; FALLBACK24-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK24-NEXT: movb %dl, %cl ; FALLBACK24-NEXT: shrl %cl, %edi -; FALLBACK24-NEXT: addl %ebx, %ebx +; FALLBACK24-NEXT: addl %ebp, %ebp ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %edi, %ebx -; 
FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK24-NEXT: movl %edi, %ebp -; FALLBACK24-NEXT: movl %eax, %ecx -; FALLBACK24-NEXT: shrl %cl, %ebp -; FALLBACK24-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %edi, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl 108(%esp,%ebx), %edi +; FALLBACK24-NEXT: movl %edi, %eax +; FALLBACK24-NEXT: movl %edx, %ecx +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl 112(%esp,%ebx), %ecx ; FALLBACK24-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK24-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK24-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %ebx -; FALLBACK24-NEXT: orl %ebp, %ebx -; FALLBACK24-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %eax, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %esi ; FALLBACK24-NEXT: addl %edi, %edi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %edi -; FALLBACK24-NEXT: orl %edx, %edi -; FALLBACK24-NEXT: movl %esi, %edx -; FALLBACK24-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK24-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK24-NEXT: movl %esi, %ebx -; FALLBACK24-NEXT: movb %al, %cl -; FALLBACK24-NEXT: shrl %cl, %ebx -; FALLBACK24-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK24-NEXT: leal (%eax,%eax), %ebp +; FALLBACK24-NEXT: orl %esi, %edi +; FALLBACK24-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK24-NEXT: movl %esi, %eax +; FALLBACK24-NEXT: movb %dl, %cl +; FALLBACK24-NEXT: shrl %cl, %eax +; FALLBACK24-NEXT: movl 120(%esp,%ebx), %ebx +; FALLBACK24-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %ebp -; FALLBACK24-NEXT: orl %ebx, %ebp +; FALLBACK24-NEXT: orl %eax, %ebp +; FALLBACK24-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK24-NEXT: movl %edx, %eax +; FALLBACK24-NEXT: movb %al, %cl ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK24-NEXT: shrl %cl, %ebx +; FALLBACK24-NEXT: shrl %cl, %edx ; FALLBACK24-NEXT: addl %esi, %esi ; FALLBACK24-NEXT: movb %ch, %cl ; FALLBACK24-NEXT: shll %cl, %esi -; FALLBACK24-NEXT: orl %ebx, %esi -; FALLBACK24-NEXT: movb %dl, %cl -; FALLBACK24-NEXT: shrl %cl, %eax -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK24-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK24-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK24-NEXT: orl %edx, %esi +; FALLBACK24-NEXT: movb %al, %cl +; FALLBACK24-NEXT: movl %ebx, %edx +; FALLBACK24-NEXT: shrl %cl, %edx +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK24-NEXT: movl 124(%esp,%ebx), %ebx +; FALLBACK24-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK24-NEXT: movb %ch, %cl -; FALLBACK24-NEXT: shll %cl, %edx -; FALLBACK24-NEXT: orl %eax, %edx -; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK24-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK24-NEXT: shll %cl, %ebp +; FALLBACK24-NEXT: orl %edx, %ebp +; 
FALLBACK24-NEXT: movl %eax, %ecx ; FALLBACK24-NEXT: sarl %cl, %ebx ; FALLBACK24-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK24-NEXT: movl %ebx, 60(%eax) -; FALLBACK24-NEXT: movl %edx, 56(%eax) +; FALLBACK24-NEXT: movl %ebp, 56(%eax) ; FALLBACK24-NEXT: movl %esi, 48(%eax) -; FALLBACK24-NEXT: movl %ebp, 52(%eax) +; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK24-NEXT: movl %ecx, 52(%eax) ; FALLBACK24-NEXT: movl %edi, 40(%eax) ; FALLBACK24-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK24-NEXT: movl %ecx, 44(%eax) @@ -23191,91 +23211,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK25-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK25-NEXT: movl %ecx, %ebp -; FALLBACK25-NEXT: andl $60, %ebp -; FALLBACK25-NEXT: movl 56(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shll $3, %ecx -; FALLBACK25-NEXT: andl $24, %ecx -; FALLBACK25-NEXT: shrdl %cl, %edx, %eax -; FALLBACK25-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %esi -; FALLBACK25-NEXT: shrdl %cl, %edi, %esi -; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx +; FALLBACK25-NEXT: movl %ecx, %eax +; FALLBACK25-NEXT: andl $60, %eax +; FALLBACK25-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK25-NEXT: movl 52(%esp,%eax), %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx +; FALLBACK25-NEXT: shll $3, %ecx ; FALLBACK25-NEXT: shrdl %cl, %esi, %edx ; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edi +; FALLBACK25-NEXT: movl 64(%esp,%eax), %ebx +; FALLBACK25-NEXT: movl 60(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %edi +; FALLBACK25-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK25-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %edi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi +; FALLBACK25-NEXT: shrdl %cl, %edx, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 72(%esp,%eax), %edi +; FALLBACK25-NEXT: movl 68(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %esi +; FALLBACK25-NEXT: shrdl %cl, %edi, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 80(%esp,%eax), %ebx +; FALLBACK25-NEXT: movl 76(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %esi +; FALLBACK25-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edx -; FALLBACK25-NEXT: shrdl %cl, %esi, %edx -; FALLBACK25-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl %esi, %edx 
-; FALLBACK25-NEXT: shrdl %cl, %eax, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK25-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi -; FALLBACK25-NEXT: shrdl %cl, %esi, %edi -; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK25-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK25-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK25-NEXT: movl %eax, %edi ; FALLBACK25-NEXT: shrdl %cl, %edx, %edi -; FALLBACK25-NEXT: shrdl %cl, %eax, %esi -; FALLBACK25-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK25-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK25-NEXT: shrdl %cl, %eax, %edx -; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK25-NEXT: movl %edx, 56(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK25-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 88(%esp,%eax), %edi +; FALLBACK25-NEXT: movl 84(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %esi +; FALLBACK25-NEXT: shrdl %cl, %edi, %esi +; FALLBACK25-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl %edi, %esi ; FALLBACK25-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK25-NEXT: movl 92(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %ebx +; FALLBACK25-NEXT: shrdl %cl, %edi, %ebx +; FALLBACK25-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK25-NEXT: shrdl %cl, %edx, %esi +; FALLBACK25-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK25-NEXT: movl 104(%esp,%eax), %esi +; FALLBACK25-NEXT: movl 100(%esp,%eax), %edx +; FALLBACK25-NEXT: movl %edx, %ebx +; FALLBACK25-NEXT: shrdl %cl, %esi, %ebx +; FALLBACK25-NEXT: shrdl %cl, %edx, %edi +; FALLBACK25-NEXT: movl 48(%esp,%eax), %ebp +; FALLBACK25-NEXT: movl 108(%esp,%eax), %edx +; FALLBACK25-NEXT: shrdl %cl, %edx, %esi +; FALLBACK25-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK25-NEXT: movl %esi, 56(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK25-NEXT: shrdl %cl, %esi, %ebp ; FALLBACK25-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK25-NEXT: sarl %cl, %eax -; FALLBACK25-NEXT: movl %eax, 60(%ebp) -; FALLBACK25-NEXT: movl %esi, 48(%ebp) -; FALLBACK25-NEXT: movl %edi, 52(%ebp) -; FALLBACK25-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 40(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 44(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 32(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 36(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 24(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 28(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 16(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 20(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 8(%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 
4-byte Reload -; FALLBACK25-NEXT: movl %eax, 12(%ebp) -; FALLBACK25-NEXT: movl %ebx, (%ebp) -; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK25-NEXT: movl %eax, 4(%ebp) +; FALLBACK25-NEXT: sarl %cl, %edx +; FALLBACK25-NEXT: movl %edx, 60(%eax) +; FALLBACK25-NEXT: movl %edi, 48(%eax) +; FALLBACK25-NEXT: movl %ebx, 52(%eax) +; FALLBACK25-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 40(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 44(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 32(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 36(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 24(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 28(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 16(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 20(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 8(%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 12(%eax) +; FALLBACK25-NEXT: movl %ebp, (%eax) +; FALLBACK25-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK25-NEXT: movl %ecx, 4(%eax) ; FALLBACK25-NEXT: addl $188, %esp ; FALLBACK25-NEXT: popl %esi ; FALLBACK25-NEXT: popl %edi @@ -23325,7 +23344,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK26-NEXT: movl %eax, %ecx ; FALLBACK26-NEXT: leal (,%eax,8), %edx -; FALLBACK26-NEXT: andl $24, %edx ; FALLBACK26-NEXT: andl $60, %ecx ; FALLBACK26-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK26-NEXT: movl 72(%esp,%ecx), %edi @@ -23333,6 +23351,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK26-NEXT: shrxl %edx, %esi, %eax ; FALLBACK26-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK26-NEXT: movl %edx, %ebx +; FALLBACK26-NEXT: andb $24, %bl ; FALLBACK26-NEXT: notb %bl ; FALLBACK26-NEXT: leal (%edi,%edi), %ebp ; FALLBACK26-NEXT: shlxl %ebx, %ebp, %eax @@ -23506,7 +23525,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK27-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: shll $3, %ecx -; FALLBACK27-NEXT: andl $24, %ecx ; FALLBACK27-NEXT: shrdl %cl, %edx, %eax ; FALLBACK27-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK27-NEXT: movl 64(%esp,%ebp), %edi @@ -23598,191 +23616,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK28-NEXT: pushl %edi ; FALLBACK28-NEXT: pushl %esi ; FALLBACK28-NEXT: subl $204, %esp -; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %ecx -; FALLBACK28-NEXT: vmovups (%ecx), %ymm0 -; FALLBACK28-NEXT: vmovups 32(%ecx), %xmm1 -; FALLBACK28-NEXT: movl 48(%ecx), %edx -; FALLBACK28-NEXT: movl 52(%ecx), %esi -; FALLBACK28-NEXT: movl 56(%ecx), %edi -; FALLBACK28-NEXT: movl 60(%ecx), %ecx -; FALLBACK28-NEXT: movl (%eax), %eax -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; 
FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK28-NEXT: vmovups (%eax), %ymm0 +; FALLBACK28-NEXT: vmovups 32(%eax), %xmm1 +; FALLBACK28-NEXT: movl 48(%eax), %edx +; FALLBACK28-NEXT: movl 52(%eax), %esi +; FALLBACK28-NEXT: movl 56(%eax), %edi +; FALLBACK28-NEXT: movl 60(%eax), %eax +; FALLBACK28-NEXT: movl (%ecx), %ecx +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edi, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %esi, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: movl %edx, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovaps %xmm1, {{[0-9]+}}(%esp) ; FALLBACK28-NEXT: vmovups %ymm0, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: sarl $31, %ecx -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; FALLBACK28-NEXT: movl %eax, %esi -; FALLBACK28-NEXT: andl $60, %esi -; FALLBACK28-NEXT: movl 68(%esp,%esi), %edx -; FALLBACK28-NEXT: shll $3, %eax -; FALLBACK28-NEXT: andl $24, %eax -; FALLBACK28-NEXT: movl %edx, %edi -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: movl 72(%esp,%esi), %ecx +; FALLBACK28-NEXT: sarl $31, %eax +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %eax, {{[0-9]+}}(%esp) +; FALLBACK28-NEXT: movl %ecx, %edx +; FALLBACK28-NEXT: movl %ecx, %eax +; FALLBACK28-NEXT: andl $60, %eax +; FALLBACK28-NEXT: movl 68(%esp,%eax), %edi +; FALLBACK28-NEXT: movl %eax, %ebx +; FALLBACK28-NEXT: shll $3, %edx +; FALLBACK28-NEXT: movl %edi, %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl 72(%esp,%ebx), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx -; FALLBACK28-NEXT: movb %al, %ch +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebp +; FALLBACK28-NEXT: movb %dl, %ch +; FALLBACK28-NEXT: andb $24, %ch ; FALLBACK28-NEXT: notb %ch ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 64(%esp,%esi), %edi -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %edx, %edx +; 
FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %eax, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 64(%esp,%ebx), %eax +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: addl %edi, %edi ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %edi, %edx +; FALLBACK28-NEXT: shll %cl, %edi +; FALLBACK28-NEXT: orl %eax, %edi +; FALLBACK28-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 76(%esp,%ebx), %ebp +; FALLBACK28-NEXT: movl %ebp, %eax +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl 80(%esp,%ebx), %edi +; FALLBACK28-NEXT: leal (%edi,%edi), %esi +; FALLBACK28-NEXT: movb %ch, %cl +; FALLBACK28-NEXT: shll %cl, %esi +; FALLBACK28-NEXT: orl %eax, %esi +; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 76(%esp,%esi), %edx -; FALLBACK28-NEXT: movl %edx, %ebp -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 80(%esp,%esi), %edi -; FALLBACK28-NEXT: leal (%edi,%edi), %ebx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: addl %ebp, %ebp ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, %ebx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %eax, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: addl %edx, %edx +; FALLBACK28-NEXT: movl 84(%esp,%ebx), %ebp +; FALLBACK28-NEXT: movl %ebp, %eax +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl 88(%esp,%ebx), %esi +; FALLBACK28-NEXT: leal (%esi,%esi), %edx ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %ebx, %edx +; FALLBACK28-NEXT: orl %eax, %edx ; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 84(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movl %eax, %edx -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 88(%esp,%esi), %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: addl %eax, %eax -; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax -; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: movb %bl, %cl ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: addl %ebp, %ebp ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 92(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl 
-; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 96(%esp,%esi), %edi +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %edi, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; FALLBACK28-NEXT: movl 92(%esp,%eax), %ebp +; FALLBACK28-NEXT: movl %ebp, %edx +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: movl 96(%esp,%eax), %edi ; FALLBACK28-NEXT: leal (%edi,%edi), %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: orl %edx, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: movb %bl, %cl +; FALLBACK28-NEXT: movl %ebx, %eax +; FALLBACK28-NEXT: shrl %cl, %esi +; FALLBACK28-NEXT: addl %ebp, %ebp ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %eax, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 100(%esp,%esi), %ebx -; FALLBACK28-NEXT: movl %ebx, %ebp -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 104(%esp,%esi), %edx -; FALLBACK28-NEXT: leal (%edx,%edx), %eax +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %esi, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: movl 100(%esp,%ebx), %ebp +; FALLBACK28-NEXT: movl %ebp, %edx +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: movl 104(%esp,%ebx), %esi +; FALLBACK28-NEXT: leal (%esi,%esi), %eax ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %eax -; FALLBACK28-NEXT: orl %ebp, %eax +; FALLBACK28-NEXT: orl %edx, %eax ; FALLBACK28-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; FALLBACK28-NEXT: movb %dl, %cl ; FALLBACK28-NEXT: shrl %cl, %edi -; FALLBACK28-NEXT: addl %ebx, %ebx +; FALLBACK28-NEXT: addl %ebp, %ebp ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %edi, %ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 108(%esp,%esi), %edi -; FALLBACK28-NEXT: movl %edi, %ebp -; FALLBACK28-NEXT: movl %eax, %ecx -; FALLBACK28-NEXT: shrl %cl, %ebp -; FALLBACK28-NEXT: movl 112(%esp,%esi), %ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %edi, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl 108(%esp,%ebx), %edi +; FALLBACK28-NEXT: movl %edi, %eax +; FALLBACK28-NEXT: movl %edx, %ecx +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl 112(%esp,%ebx), %ecx ; FALLBACK28-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebx +; FALLBACK28-NEXT: leal (%ecx,%ecx), %ebp ; FALLBACK28-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %ebx -; FALLBACK28-NEXT: orl %ebp, 
%ebx -; FALLBACK28-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %eax, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %esi ; FALLBACK28-NEXT: addl %edi, %edi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %edi -; FALLBACK28-NEXT: orl %edx, %edi -; FALLBACK28-NEXT: movl %esi, %edx -; FALLBACK28-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK28-NEXT: movl 116(%esp,%esi), %esi -; FALLBACK28-NEXT: movl %esi, %ebx -; FALLBACK28-NEXT: movb %al, %cl -; FALLBACK28-NEXT: shrl %cl, %ebx -; FALLBACK28-NEXT: movl 120(%esp,%edx), %eax -; FALLBACK28-NEXT: leal (%eax,%eax), %ebp +; FALLBACK28-NEXT: orl %esi, %edi +; FALLBACK28-NEXT: movl 116(%esp,%ebx), %esi +; FALLBACK28-NEXT: movl %esi, %eax +; FALLBACK28-NEXT: movb %dl, %cl +; FALLBACK28-NEXT: shrl %cl, %eax +; FALLBACK28-NEXT: movl 120(%esp,%ebx), %ebx +; FALLBACK28-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %ebp -; FALLBACK28-NEXT: orl %ebx, %ebp +; FALLBACK28-NEXT: orl %eax, %ebp +; FALLBACK28-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK28-NEXT: movl %edx, %eax +; FALLBACK28-NEXT: movb %al, %cl ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; FALLBACK28-NEXT: shrl %cl, %ebx +; FALLBACK28-NEXT: shrl %cl, %edx ; FALLBACK28-NEXT: addl %esi, %esi ; FALLBACK28-NEXT: movb %ch, %cl ; FALLBACK28-NEXT: shll %cl, %esi -; FALLBACK28-NEXT: orl %ebx, %esi -; FALLBACK28-NEXT: movb %dl, %cl -; FALLBACK28-NEXT: shrl %cl, %eax -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; FALLBACK28-NEXT: movl 124(%esp,%edx), %ebx -; FALLBACK28-NEXT: leal (%ebx,%ebx), %edx +; FALLBACK28-NEXT: orl %edx, %esi +; FALLBACK28-NEXT: movb %al, %cl +; FALLBACK28-NEXT: movl %ebx, %edx +; FALLBACK28-NEXT: shrl %cl, %edx +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; FALLBACK28-NEXT: movl 124(%esp,%ebx), %ebx +; FALLBACK28-NEXT: leal (%ebx,%ebx), %ebp ; FALLBACK28-NEXT: movb %ch, %cl -; FALLBACK28-NEXT: shll %cl, %edx -; FALLBACK28-NEXT: orl %eax, %edx -; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; FALLBACK28-NEXT: # kill: def $cl killed $cl killed $ecx +; FALLBACK28-NEXT: shll %cl, %ebp +; FALLBACK28-NEXT: orl %edx, %ebp +; FALLBACK28-NEXT: movl %eax, %ecx ; FALLBACK28-NEXT: sarl %cl, %ebx ; FALLBACK28-NEXT: movl {{[0-9]+}}(%esp), %eax ; FALLBACK28-NEXT: movl %ebx, 60(%eax) -; FALLBACK28-NEXT: movl %edx, 56(%eax) +; FALLBACK28-NEXT: movl %ebp, 56(%eax) ; FALLBACK28-NEXT: movl %esi, 48(%eax) -; FALLBACK28-NEXT: movl %ebp, 52(%eax) +; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK28-NEXT: movl %ecx, 52(%eax) ; FALLBACK28-NEXT: movl %edi, 40(%eax) ; FALLBACK28-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; FALLBACK28-NEXT: movl %ecx, 44(%eax) @@ -23853,91 +23875,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) ; FALLBACK29-NEXT: movl %eax, {{[0-9]+}}(%esp) -; FALLBACK29-NEXT: movl %ecx, %ebp -; FALLBACK29-NEXT: andl $60, %ebp -; FALLBACK29-NEXT: movl 56(%esp,%ebp), %edx -; 
FALLBACK29-NEXT: movl 52(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shll $3, %ecx -; FALLBACK29-NEXT: andl $24, %ecx -; FALLBACK29-NEXT: shrdl %cl, %edx, %eax -; FALLBACK29-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 64(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 60(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %esi -; FALLBACK29-NEXT: shrdl %cl, %edi, %esi -; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx +; FALLBACK29-NEXT: movl %ecx, %eax +; FALLBACK29-NEXT: andl $60, %eax +; FALLBACK29-NEXT: movl 56(%esp,%eax), %esi +; FALLBACK29-NEXT: movl 52(%esp,%eax), %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 72(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 68(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx +; FALLBACK29-NEXT: shll $3, %ecx ; FALLBACK29-NEXT: shrdl %cl, %esi, %edx ; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi +; FALLBACK29-NEXT: movl 64(%esp,%eax), %ebx +; FALLBACK29-NEXT: movl 60(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %edi +; FALLBACK29-NEXT: shrdl %cl, %ebx, %edi ; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 80(%esp,%ebp), %edi -; FALLBACK29-NEXT: movl 76(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %edi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi +; FALLBACK29-NEXT: shrdl %cl, %edx, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 72(%esp,%eax), %edi +; FALLBACK29-NEXT: movl 68(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %esi +; FALLBACK29-NEXT: shrdl %cl, %edi, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 80(%esp,%eax), %ebx +; FALLBACK29-NEXT: movl 76(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %esi +; FALLBACK29-NEXT: shrdl %cl, %ebx, %esi ; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 88(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 84(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edx -; FALLBACK29-NEXT: shrdl %cl, %esi, %edx -; FALLBACK29-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl %esi, %edx -; FALLBACK29-NEXT: shrdl %cl, %eax, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: movl 96(%esp,%ebp), %esi -; FALLBACK29-NEXT: movl 92(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi -; FALLBACK29-NEXT: shrdl %cl, %esi, %edi -; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl %edx, (%esp) # 4-byte Spill -; FALLBACK29-NEXT: movl 104(%esp,%ebp), %edx -; FALLBACK29-NEXT: movl 100(%esp,%ebp), %eax -; FALLBACK29-NEXT: movl %eax, %edi ; FALLBACK29-NEXT: shrdl %cl, %edx, %edi -; FALLBACK29-NEXT: shrdl %cl, %eax, %esi -; FALLBACK29-NEXT: movl 48(%esp,%ebp), %ebx -; FALLBACK29-NEXT: movl 108(%esp,%ebp), %eax -; FALLBACK29-NEXT: shrdl %cl, %eax, %edx -; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %ebp -; FALLBACK29-NEXT: movl %edx, 56(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 
4-byte Reload +; FALLBACK29-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 88(%esp,%eax), %edi +; FALLBACK29-NEXT: movl 84(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %esi +; FALLBACK29-NEXT: shrdl %cl, %edi, %esi +; FALLBACK29-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl %edi, %esi ; FALLBACK29-NEXT: shrdl %cl, %edx, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: movl 96(%esp,%eax), %edi +; FALLBACK29-NEXT: movl 92(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %ebx +; FALLBACK29-NEXT: shrdl %cl, %edi, %ebx +; FALLBACK29-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; FALLBACK29-NEXT: shrdl %cl, %edx, %esi +; FALLBACK29-NEXT: movl %esi, (%esp) # 4-byte Spill +; FALLBACK29-NEXT: movl 104(%esp,%eax), %esi +; FALLBACK29-NEXT: movl 100(%esp,%eax), %edx +; FALLBACK29-NEXT: movl %edx, %ebx +; FALLBACK29-NEXT: shrdl %cl, %esi, %ebx +; FALLBACK29-NEXT: shrdl %cl, %edx, %edi +; FALLBACK29-NEXT: movl 48(%esp,%eax), %ebp +; FALLBACK29-NEXT: movl 108(%esp,%eax), %edx +; FALLBACK29-NEXT: shrdl %cl, %edx, %esi +; FALLBACK29-NEXT: movl {{[0-9]+}}(%esp), %eax +; FALLBACK29-NEXT: movl %esi, 56(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; FALLBACK29-NEXT: shrdl %cl, %esi, %ebp ; FALLBACK29-NEXT: # kill: def $cl killed $cl killed $ecx -; FALLBACK29-NEXT: sarl %cl, %eax -; FALLBACK29-NEXT: movl %eax, 60(%ebp) -; FALLBACK29-NEXT: movl %esi, 48(%ebp) -; FALLBACK29-NEXT: movl %edi, 52(%ebp) -; FALLBACK29-NEXT: movl (%esp), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 40(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 44(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 32(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 36(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 24(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 28(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 16(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 20(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 8(%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 12(%ebp) -; FALLBACK29-NEXT: movl %ebx, (%ebp) -; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; FALLBACK29-NEXT: movl %eax, 4(%ebp) +; FALLBACK29-NEXT: sarl %cl, %edx +; FALLBACK29-NEXT: movl %edx, 60(%eax) +; FALLBACK29-NEXT: movl %edi, 48(%eax) +; FALLBACK29-NEXT: movl %ebx, 52(%eax) +; FALLBACK29-NEXT: movl (%esp), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 40(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 44(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 32(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 36(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 24(%eax) +; FALLBACK29-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 28(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 16(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 20(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 8(%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 12(%eax) +; FALLBACK29-NEXT: movl %ebp, (%eax) +; FALLBACK29-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; FALLBACK29-NEXT: movl %ecx, 4(%eax) ; FALLBACK29-NEXT: addl $188, %esp ; FALLBACK29-NEXT: popl %esi ; FALLBACK29-NEXT: popl %edi @@ -23987,7 +24008,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; FALLBACK30-NEXT: movl %eax, %ecx ; FALLBACK30-NEXT: leal (,%eax,8), %edx -; FALLBACK30-NEXT: andl $24, %edx ; FALLBACK30-NEXT: andl $60, %ecx ; FALLBACK30-NEXT: movl 68(%esp,%ecx), %esi ; FALLBACK30-NEXT: movl 72(%esp,%ecx), %edi @@ -23995,6 +24015,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK30-NEXT: shrxl %edx, %esi, %eax ; FALLBACK30-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK30-NEXT: movl %edx, %ebx +; FALLBACK30-NEXT: andb $24, %bl ; FALLBACK30-NEXT: notb %bl ; FALLBACK30-NEXT: leal (%edi,%edi), %ebp ; FALLBACK30-NEXT: shlxl %ebx, %ebp, %eax @@ -24168,7 +24189,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind { ; FALLBACK31-NEXT: movl 52(%esp,%ebp), %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: shll $3, %ecx -; FALLBACK31-NEXT: andl $24, %ecx ; FALLBACK31-NEXT: shrdl %cl, %edx, %eax ; FALLBACK31-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; FALLBACK31-NEXT: movl 64(%esp,%ebp), %edi diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll index 338e104fbe8f0..b542d03bb24e2 100644 --- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll +++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll @@ -3088,113 +3088,107 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d +; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %r14 +; 
X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %edi ; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rdi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rdi), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; 
X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rdi), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r13,%r13), %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes: @@ -3202,7 +3196,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax ; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 @@ -3210,7 +3204,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -3223,39 +3217,38 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r8d +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %r8d +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %r8d +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%r8), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%r8), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%r8), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%r8), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%r8), %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%r8), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%r8), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%r8), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 56(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 
%rsi, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15 @@ -3296,54 +3289,52 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r12b +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r10 +; 
X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r14, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 @@ -3358,7 +3349,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 @@ -3366,7 +3357,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) @@ -3379,38 +3370,37 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r8d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %r8d +; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %r8d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%r8), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%r8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%r8), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%r8), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%r8), %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%r8), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%r8), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%r8), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %r8, %rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15 @@ -3423,35 +3413,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 
4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%esi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%esi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) @@ -3459,7 +3449,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -3479,174 +3469,166 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%esi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax +; 
X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%esi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%edi), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%edi), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%esi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%edi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %esi -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edi), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%esi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%esi), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%esi), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; 
X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%esi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 56(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -3685,68 +3667,67 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl $188, %esp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax +; 
X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 32(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 36(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 40(%ecx), %ebp +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 44(%ecx), %ebx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 
4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi @@ -3845,8 +3826,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx @@ -3868,156 +3848,153 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: 
movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, 64(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ecx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ecx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebx ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -4054,149 +4031,150 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $188, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ecx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ecx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte 
Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%ebp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrxl %ecx, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
{{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi @@ -4217,40 +4195,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx +; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi +; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax ; X64-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: negl %esi -; X64-NO-BMI2-NO-SHLD-NEXT: movslq %esi, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: negl %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: movslq %ecx, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rbx), %r8 ; X64-NO-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rbx), %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi +; X64-NO-BMI2-NO-SHLD-NEXT: andb $63, %sil ; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %r9 @@ -4327,33 +4305,32 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq 
%rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: negl %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movslq %eax, %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r9), %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r9), %r10 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, %rsi @@ -4395,51 +4372,51 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq (%rdi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %esi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl (%rsi), %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; 
X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %esi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movslq %ecx, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -64(%rsp,%rsi), %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -56(%rsp,%rsi), %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rcx, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -40(%rsp,%rsi), %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rdi, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -48(%rsp,%rsi), %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r14, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -24(%rsp,%rsi), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %r15 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r10, %r12 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %r13d +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $63, %r13b ; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r13b ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r9, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r9, %rbp +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -32(%rsp,%rsi), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %r8, %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r14, %r14 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %r14 @@ -4449,23 +4426,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rcx, %rcx ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrq %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %r13, %r9, 
%r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r12, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 40(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 16(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r14, 24(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) @@ -4484,33 +4461,32 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 32(%rdi), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %esi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %esi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: negl %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movslq %eax, %r8 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -48(%rsp,%r8), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -40(%rsp,%r8), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rsi @@ -4570,14 +4546,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%eax), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 48(%eax), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%eax), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -4585,7 +4562,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload @@ -4609,196 +4587,198 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: subl %ecx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: subl %eax, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%edi), %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %dl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: 
movl 12(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 28(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl 24(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%eax), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 32(%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 40(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: negl %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 176(%esp,%eax), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 
%eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 60(%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 56(%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 56(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 52(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 40(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 44(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 32(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 36(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 24(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 28(%ecx) -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 16(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 20(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 8(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 12(%ecx) -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 56(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 60(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 48(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 52(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 16(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 20(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 8(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 12(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 4(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $204, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: popl %edi @@ -4880,7 +4860,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: subl %ebp, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill @@ -4972,38 +4951,38 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%ebp), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%ebp), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%ebp), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%ebp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) @@ -5011,7 +4990,9 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5032,125 +5013,127 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %ebp, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: subl %eax, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $31, %bl ; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%edi), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %edx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%ebp), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 12(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 16(%edi), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 20(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %edx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 24(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 28(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, (%esp), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 32(%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%edi), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%ebp), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%ebp), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %edx, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%edi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%edi), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte 
Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%esi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%esi), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, 188(%esp,%ecx), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: negl %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, 188(%esp,%ebp), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%edi), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %edi, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ebx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, (%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 56(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 48(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -5161,7 +5144,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 32(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 36(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 24(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 28(%eax) @@ -5189,42 +5172,42 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: pushl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl $204, %esp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 
{{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ebx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%ecx), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ebx), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ebx), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ebx), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ebx), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ebx), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ebx), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%ecx), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%ecx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%ecx), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%ecx), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%ecx), %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%ecx), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: xorps %xmm0, %xmm0 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) @@ -5249,48 +5232,47 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) 
; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: leal {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebx, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: subl %ebp, %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 4(%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 8(%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 12(%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 16(%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 20(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 24(%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 28(%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 32(%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 36(%eax), %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edi, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 40(%eax), %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 44(%eax), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %edx +; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %ebx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi @@ -5298,8 +5280,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %esi, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 176(%esp,%ebx), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: negl %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 176(%esp,%ebp), %ebp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 60(%eax) @@ -5308,13 +5290,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %ebp, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 48(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shldl %cl, %edx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 48(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, 40(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 44(%eax) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -5351,117 +5333,111 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes: ; X64-NO-BMI2-NO-SHLD: # %bb.0: -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: pushq %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax -; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: pushq %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq (%rdi), %rax +; X64-NO-BMI2-NO-SHLD-NEXT: movq 8(%rdi), %rcx +; X64-NO-BMI2-NO-SHLD-NEXT: movq 16(%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 24(%rdi), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 32(%rdi), %r10 +; 
X64-NO-BMI2-NO-SHLD-NEXT: movq 40(%rdi), %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: movq 48(%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq 56(%rdi), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl (%rsi), %edi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NO-SHLD-NEXT: movl %r8d, %eax +; X64-NO-BMI2-NO-SHLD-NEXT: sarq $63, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax ; X64-NO-BMI2-NO-SHLD-NEXT: andl $63, %eax -; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %r8d -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%r8), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: notl %edi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rsi, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rdi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi ; X64-NO-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 ; X64-NO-BMI2-NO-SHLD-NEXT: orq %r11, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%r8), %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%r8), %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx -; X64-NO-BMI2-NO-SHLD-NEXT: 
orq %r12, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r8, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r10, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rdi), %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r11, %r11 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rdi), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r14,%r14), %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %r11 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%r8), %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r11 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%r8), %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rbp,%rbp), %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r12 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: addq %r10, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbx, %r10 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rdi), %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r12 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rdi), %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r13,%r13), %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r15, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r15 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r12, %r15 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rbp -; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%r8), %rdi -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: addq %rbx, %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %rbp, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r14, %rbx +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r13 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rdi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%rdi,%rdi), %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %esi, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r14 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r13, %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: sarq %cl, %rdi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rdi, 56(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, 48(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 32(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r12, 40(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 16(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: addq $8, %rsp +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r14, 48(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %rbx, 32(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r15, 40(%rdx) +; 
X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, 16(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r11, 24(%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r12 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r13 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-NO-SHLD-NEXT: popq %r15 -; X64-NO-BMI2-NO-SHLD-NEXT: popq %rbp ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes: @@ -5469,7 +5445,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 @@ -5477,7 +5453,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) @@ -5485,7 +5461,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -5495,38 +5471,37 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r8d +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %r8d +; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %r8d +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%r8), %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%r8), %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%r8), %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%r8), %r10 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%r8), %r11 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq 
%cl, %r11, %rdi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%r8), %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%r8), %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 ; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%r8), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %rax +; X64-NO-BMI2-HAVE-SHLD-NEXT: sarq %cl, %r8 ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 56(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %r15 @@ -5571,54 +5546,52 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $63, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %eax ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r13 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r11, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rax), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rax), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rdi, %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rax), %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -104(%rsp,%rax), %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rsi, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r9, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r14, %r15 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r10, %rbp ; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %r12d -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %r12d +; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %r12b +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r9, %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r9 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rdi, %rdi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rdi, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %rdi -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -96(%rsp,%rax), %r15 -; 
X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %rbp -; X64-HAVE-BMI2-NO-SHLD-NEXT: xorb $63, %sil -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbx, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rbx, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rbx -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %r15 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r15, %r13 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -80(%rsp,%rax), %rbx +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %rbx, %r13 ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -72(%rsp,%rax), %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: sarxq %rcx, %rax, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r9, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r10, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r15,%r15), %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r10, %r10 ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r10, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r14, %r10 -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r11, %r11 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %r11, %r11 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r8, %r10 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rsi, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r11, %rsi +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%rbx,%rbx), %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r8, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r15, %r8 +; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r14, %r14 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %r14, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rbp, %r11 ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rax, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rsi, %rax, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %r12, %rax, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %r13, %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 48(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r11, 32(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 40(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 16(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, (%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r8, 40(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rsi, 16(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r10, 24(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq $8, %rsp ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %r12 @@ -5633,7 +5606,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: pushq %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rcx +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq (%rdi), %rax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 8(%rdi), %r8 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 16(%rdi), %r9 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 24(%rdi), %r10 @@ -5641,7 +5614,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 40(%rdi), %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 48(%rdi), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 56(%rdi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %eax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%rsi), %ecx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq 
%rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, -{{[0-9]+}}(%rsp) @@ -5649,7 +5622,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, -{{[0-9]+}}(%rsp) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarq $63, %rdi ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) @@ -5659,37 +5632,36 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, -{{[0-9]+}}(%rsp) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $63, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rax), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rax), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rax), %r9 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rdi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%rax), %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%rax), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r8d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %r8d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %r8d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%r8), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%r8), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%r8), %r9 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r9, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rsi, %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -96(%rsp,%r8), %r10 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -104(%rsp,%r8), %r11 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r10, %rbx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%rax), %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%rax), %r14 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -80(%rsp,%r8), %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -88(%rsp,%r8), %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r14, %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r11, %r15 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r14, %r10 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%rax), %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %rax, %r11 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %rax, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -72(%rsp,%r8), %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %r11 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: sarxq %rcx, %r8, %rcx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r11, 48(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, 56(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rcx, 56(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r10, 32(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r15, 40(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 16(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, 16(%rdx) ; 
X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rbx, 24(%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rsi, (%rdx) -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %r8, 8(%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rax, (%rdx) +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq %rdi, 8(%rdx) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %rbx ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r14 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: popq %r15 @@ -5706,7 +5678,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl (%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%eax), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%eax), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-NO-BMI2-NO-SHLD-NEXT: movl 12(%eax), %ecx @@ -5753,7 +5725,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -5774,171 +5746,158 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movb %cl, (%esp) # 1-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; 
X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebp), %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl $3, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 68(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: xorb $31, %ch +; X86-NO-BMI2-NO-SHLD-NEXT: movl 72(%esp,%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 64(%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 76(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl 80(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; 
X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebp), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 84(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebp), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edx,%edx), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 88(%esp,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 92(%esp,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 96(%esp,%edi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 
{{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl 100(%esp,%eax), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 104(%esp,%eax), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebp), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi ; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ecx,%ecx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi -; 
X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb (%esp), %ch # 1-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebp), %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebp), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edx, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movb %al, %cl -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-NO-BMI2-NO-SHLD-NEXT: movl 124(%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movb %ch, %cl ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: orl %eax, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: sarl %cl, %esi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 60(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 60(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, 56(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %esi, 48(%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 52(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebx, 40(%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) @@ -6002,9 +5961,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) @@ -6020,7 +5979,7 @@ 
define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload @@ -6031,25 +5990,24 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx +; X86-NO-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-NO-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: andl $60, %eax ; X86-NO-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %esi @@ -6168,42 +6126,40 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 36(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ecx -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 40(%eax), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 44(%eax), %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 48(%eax), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 52(%eax), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 56(%eax), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 60(%eax), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl (%eax), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: sarl $31, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) @@ -6221,124 +6177,115 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: notl %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %al -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %edi, %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %eax, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ebx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $31, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrl $3, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 68(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 72(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: xorb $31, %bl +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, 
%eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 64(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ebx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 80(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 76(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 88(%esp,%ecx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ebx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 84(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 96(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%esi,%esi), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 92(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte 
Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 104(%esp,%ecx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ebx), %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 100(%esp,%ecx), %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %edi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ebx), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 112(%esp,%ecx), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edi, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ebx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 108(%esp,%ecx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %esi, %ebp ; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ebp, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%eax,%eax), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %esi, %edi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%ebx), %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebp -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edi, %edi -; 
X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %edi -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebx), %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %eax, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %ecx, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 120(%esp,%edi), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (%ebp,%ebp), %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ecx, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 116(%esp,%edi), %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %edi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %eax, %eax -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %eax, %ebx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %eax, %edi +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %ebp, %eax +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 124(%esp,%ebp), %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: sarxl %edx, %ebp, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ebp, %ebp +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ebx, %ebp, %ebx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %eax, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edx, 60(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebx, 56(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ebp, 52(%eax) -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 40(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 52(%eax) +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %esi, 40(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, 44(%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload @@ -6401,9 +6348,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%eax), %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%eax), %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%eax), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%ecx), %ecx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[0-9]+}}(%esp) @@ -6419,7 +6366,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload @@ -6430,106 +6377,107 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind { ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, {{[0-9]+}}(%esp) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $31, %ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarl $31, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrl $3, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: andl $60, %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 56(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 52(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%ebp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx ; 
X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 64(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 60(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 72(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 68(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 80(%esp,%ebp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 76(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%ebp), %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %edi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 88(%esp,%eax), %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 84(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebp, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 96(%esp,%eax), %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 92(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: 
movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %ebp -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%eax), %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edx, %esi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %ebx, %edi -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%eax), %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%eax), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 56(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, 48(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 52(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebp, 40(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 44(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 32(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 36(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 24(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 28(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 16(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 20(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 8(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 12(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 104(%esp,%ebp), %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 100(%esp,%ebp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edi, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %edx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %esi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 48(%esp,%ebp), %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl 108(%esp,%ebp), %edi +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ebp +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 56(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: sarxl %ecx, %edi, %eax +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 60(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %esi, 48(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 52(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 40(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 44(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 32(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), 
%eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 36(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 24(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 28(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 16(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 20(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 8(%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 12(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %esi, %ebx -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, 4(%eax) -; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %edx, 60(%eax) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: shrdl %cl, %eax, %ebx +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ebx, (%ebp) +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-HAVE-BMI2-HAVE-SHLD-NEXT: movl %eax, 4(%ebp) ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: addl $188, %esp ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %esi ; X86-HAVE-BMI2-HAVE-SHLD-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll index 81c4d5d71084c..5a92987fd086a 100644 --- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll +++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll @@ -434,8 +434,9 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $36, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -448,18 +449,19 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: andb $12, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%esi), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; 
X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $40, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orb %dl, %bl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $36, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half: @@ -505,12 +507,12 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $3, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $12, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orb %dl, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $40, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi @@ -966,18 +968,25 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2: # %bb.0: ; X64-NO-BMI2-NEXT: movups (%rdi), %xmm0 ; X64-NO-BMI2-NEXT: xorps %xmm1, %xmm1 -; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %eax ; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: movl %ecx, %eax -; X64-NO-BMI2-NEXT: shrb $6, %al -; X64-NO-BMI2-NEXT: movzbl %al, %eax -; X64-NO-BMI2-NEXT: movq -72(%rsp,%rax,8), %rax -; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NO-BMI2-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NEXT: movb %al, (%rdx) +; X64-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NEXT: shrb $6, %cl +; X64-NO-BMI2-NEXT: movzbl %cl, %esi +; X64-NO-BMI2-NEXT: movl -64(%rsp,%rsi,8), %edi +; X64-NO-BMI2-NEXT: addl %edi, %edi +; X64-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NEXT: andb $56, %cl +; X64-NO-BMI2-NEXT: notb %cl +; X64-NO-BMI2-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NEXT: movq -72(%rsp,%rsi,8), %rsi +; X64-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NEXT: orb %sil, %dil +; X64-NO-BMI2-NEXT: movb %dil, (%rdx) ; X64-NO-BMI2-NEXT: retq ; ; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -992,14 +1001,22 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: movl %esi, %eax ; X64-BMI2-NEXT: shrb $6, %al ; X64-BMI2-NEXT: movzbl %al, %eax -; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rax -; X64-BMI2-NEXT: movb %al, (%rdx) +; X64-BMI2-NEXT: shrxq %rsi, -72(%rsp,%rax,8), %rcx +; X64-BMI2-NEXT: # kill: def $sil killed $sil killed $rsi def $rsi +; X64-BMI2-NEXT: andb $56, %sil +; X64-BMI2-NEXT: notb %sil +; X64-BMI2-NEXT: movl -64(%rsp,%rax,8), %eax +; X64-BMI2-NEXT: addl %eax, %eax +; X64-BMI2-NEXT: shlxq %rsi, %rax, %rax +; X64-BMI2-NEXT: orb %al, %cl +; X64-BMI2-NEXT: movb %cl, (%rdx) ; 
X64-BMI2-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi -; X86-NO-BMI2-NO-SHLD-NEXT: subl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: subl $68, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx @@ -1012,18 +1029,19 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: shrb $5, %dl -; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx,4), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movzbl %dl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%esi,4), %edx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%esi,4), %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: addl %ebx, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) -; X86-NO-BMI2-NO-SHLD-NEXT: addl $72, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: orb %dl, %bl +; X86-NO-BMI2-NO-SHLD-NEXT: movb %bl, (%eax) +; X86-NO-BMI2-NO-SHLD-NEXT: addl $68, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half: @@ -1069,12 +1087,12 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrb $5, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movzbl %dl, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx,4), %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx,4), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edx, %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %edx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orb %dl, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $72, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi @@ -1892,18 +1910,20 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-NEXT: movaps %xmm2, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) -; X64-NO-BMI2-NEXT: leal (,%rsi,8), %ecx -; X64-NO-BMI2-NEXT: andl $56, %ecx +; X64-NO-BMI2-NEXT: leal (,%rsi,8), %eax +; X64-NO-BMI2-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi ; X64-NO-BMI2-NEXT: andl $56, %esi -; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rax -; X64-NO-BMI2-NEXT: shrq %cl, %rax -; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi -; X64-NO-BMI2-NEXT: addl %esi, %esi -; X64-NO-BMI2-NEXT: notl %ecx -; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx -; 
X64-NO-BMI2-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-NEXT: orl %eax, %esi -; X64-NO-BMI2-NEXT: movb %sil, (%rdx) +; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %edi +; X64-NO-BMI2-NEXT: addl %edi, %edi +; X64-NO-BMI2-NEXT: andl $56, %eax +; X64-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NEXT: notb %cl +; X64-NO-BMI2-NEXT: shlq %cl, %rdi +; X64-NO-BMI2-NEXT: movq -128(%rsp,%rsi), %rsi +; X64-NO-BMI2-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NEXT: shrq %cl, %rsi +; X64-NO-BMI2-NEXT: orb %sil, %dil +; X64-NO-BMI2-NEXT: movb %dil, (%rdx) ; X64-NO-BMI2-NEXT: popq %rax ; X64-NO-BMI2-NEXT: retq ; @@ -1925,19 +1945,19 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: andl $56, %eax ; X64-BMI2-NEXT: andl $56, %esi ; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; X64-BMI2-NEXT: notl %eax +; X64-BMI2-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-BMI2-NEXT: notb %al ; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi ; X64-BMI2-NEXT: addl %esi, %esi ; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax -; X64-BMI2-NEXT: orl %eax, %ecx +; X64-BMI2-NEXT: orb %al, %cl ; X64-BMI2-NEXT: movb %cl, (%rdx) ; X64-BMI2-NEXT: popq %rax ; X64-BMI2-NEXT: retq ; ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: ; X86-NO-BMI2-NO-SHLD: # %bb.0: -; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: pushl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: subl $136, %esp ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax @@ -1955,17 +1975,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edx,8), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edx), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %edx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: notb %cl ; X86-NO-BMI2-NO-SHLD-NEXT: addl %edx, %edx ; X86-NO-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %edx -; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: orb %bl, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movb %dl, (%eax) ; X86-NO-BMI2-NO-SHLD-NEXT: addl $136, %esp -; X86-NO-BMI2-NO-SHLD-NEXT: popl %esi +; X86-NO-BMI2-NO-SHLD-NEXT: popl %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: retl ; ; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half: @@ -2015,14 +2035,15 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, (%esp,%ecx), %esi -; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl -; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %edx, %ecx, %ecx -; X86-HAVE-BMI2-NO-SHLD-NEXT: orl %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl %ecx, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edx), %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shll $3, %ecx +; 
X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %ecx, (%esp,%edx), %edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %cl +; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %esi, %esi +; X86-HAVE-BMI2-NO-SHLD-NEXT: shlxl %ecx, %esi, %ecx +; X86-HAVE-BMI2-NO-SHLD-NEXT: orb %dl, %cl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movb %cl, (%eax) ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl $136, %esp ; X86-HAVE-BMI2-NO-SHLD-NEXT: popl %esi @@ -2063,7 +2084,7 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi ; X64-NO-BMI2-NEXT: addl %esi, %esi -; X64-NO-BMI2-NEXT: notl %ecx +; X64-NO-BMI2-NEXT: notb %cl ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shlq %cl, %rsi ; X64-NO-BMI2-NEXT: orl %eax, %esi @@ -2089,8 +2110,8 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: andl $56, %eax ; X64-BMI2-NEXT: andl $56, %esi ; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; X64-BMI2-NEXT: notl %eax +; X64-BMI2-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-BMI2-NEXT: notb %al ; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi ; X64-BMI2-NEXT: addl %esi, %esi ; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax @@ -2226,7 +2247,7 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-NO-BMI2-NEXT: shrq %cl, %rax ; X64-NO-BMI2-NEXT: movl -120(%rsp,%rsi), %esi ; X64-NO-BMI2-NEXT: addl %esi, %esi -; X64-NO-BMI2-NEXT: notl %ecx +; X64-NO-BMI2-NEXT: notb %cl ; X64-NO-BMI2-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NO-BMI2-NEXT: shlq %cl, %rsi ; X64-NO-BMI2-NEXT: orl %eax, %esi @@ -2252,8 +2273,8 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X64-BMI2-NEXT: andl $56, %eax ; X64-BMI2-NEXT: andl $56, %esi ; X64-BMI2-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx -; X64-BMI2-NEXT: # kill: def $eax killed $eax killed $rax def $rax -; X64-BMI2-NEXT: notl %eax +; X64-BMI2-NEXT: # kill: def $al killed $al killed $rax def $rax +; X64-BMI2-NEXT: notb %al ; X64-BMI2-NEXT: movl -120(%rsp,%rsi), %esi ; X64-BMI2-NEXT: addl %esi, %esi ; X64-BMI2-NEXT: shlxq %rax, %rsi, %rax @@ -2453,10 +2474,10 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %edi ; X86-NO-BMI2-NO-SHLD-NEXT: pushl %esi ; X86-NO-BMI2-NO-SHLD-NEXT: subl $140, %esp +; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: movups (%ecx), %xmm0 -; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%ecx), %xmm1 +; X86-NO-BMI2-NO-SHLD-NEXT: movups (%eax), %xmm0 +; X86-NO-BMI2-NO-SHLD-NEXT: movups 16(%eax), %xmm1 ; X86-NO-BMI2-NO-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) @@ -2466,27 +2487,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %ebx -; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%ebx), %esi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ebx), %edi -; X86-NO-BMI2-NO-SHLD-NEXT: shll $3, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: andl 
$24, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: leal (,%edi,8), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: andl $60, %edi +; X86-NO-BMI2-NO-SHLD-NEXT: movl (%esp,%edi), %esi +; X86-NO-BMI2-NO-SHLD-NEXT: movl 4(%esp,%edi), %ebx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %esi -; X86-NO-BMI2-NO-SHLD-NEXT: leal (%edi,%edi), %ebp +; X86-NO-BMI2-NO-SHLD-NEXT: leal (%ebx,%ebx), %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edx +; X86-NO-BMI2-NO-SHLD-NEXT: andb $24, %dl ; X86-NO-BMI2-NO-SHLD-NEXT: notb %dl ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: orl %esi, %ebp ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %edi -; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ebx), %eax +; X86-NO-BMI2-NO-SHLD-NEXT: shrl %cl, %ebx +; X86-NO-BMI2-NO-SHLD-NEXT: movl 8(%esp,%edi), %eax ; X86-NO-BMI2-NO-SHLD-NEXT: addl %eax, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl %edx, %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: shll %cl, %eax -; X86-NO-BMI2-NO-SHLD-NEXT: orl %edi, %eax +; X86-NO-BMI2-NO-SHLD-NEXT: orl %ebx, %eax ; X86-NO-BMI2-NO-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NO-BMI2-NO-SHLD-NEXT: movl %eax, 4(%ecx) ; X86-NO-BMI2-NO-SHLD-NEXT: movl %ebp, (%ecx) @@ -2503,11 +2523,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: pushl %edi ; X86-SHLD-NEXT: pushl %esi ; X86-SHLD-NEXT: subl $128, %esp -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-SHLD-NEXT: movups (%edx), %xmm0 -; X86-SHLD-NEXT: movups 16(%edx), %xmm1 +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-SHLD-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-SHLD-NEXT: movups (%ecx), %xmm0 +; X86-SHLD-NEXT: movups 16(%ecx), %xmm1 ; X86-SHLD-NEXT: xorps %xmm2, %xmm2 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) @@ -2517,19 +2537,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-SHLD-NEXT: movaps %xmm2, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-SHLD-NEXT: movaps %xmm0, (%esp) -; X86-SHLD-NEXT: movl %ecx, %esi -; X86-SHLD-NEXT: andl $60, %esi -; X86-SHLD-NEXT: movl 8(%esp,%esi), %edi -; X86-SHLD-NEXT: movl (%esp,%esi), %edx -; X86-SHLD-NEXT: movl 4(%esp,%esi), %esi -; X86-SHLD-NEXT: shll $3, %ecx -; X86-SHLD-NEXT: andl $24, %ecx -; X86-SHLD-NEXT: movl %esi, %ebx -; X86-SHLD-NEXT: shrdl %cl, %edi, %ebx +; X86-SHLD-NEXT: leal (,%edx,8), %ecx +; X86-SHLD-NEXT: andl $60, %edx +; X86-SHLD-NEXT: movl 8(%esp,%edx), %esi +; X86-SHLD-NEXT: movl (%esp,%edx), %edi +; X86-SHLD-NEXT: movl 4(%esp,%edx), %edx +; X86-SHLD-NEXT: movl %edx, %ebx +; X86-SHLD-NEXT: shrdl %cl, %esi, %ebx ; X86-SHLD-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SHLD-NEXT: shrdl %cl, %esi, %edx +; X86-SHLD-NEXT: shrdl %cl, %edx, %edi ; X86-SHLD-NEXT: movl %ebx, 4(%eax) -; X86-SHLD-NEXT: movl %edx, (%eax) +; X86-SHLD-NEXT: movl %edi, (%eax) ; X86-SHLD-NEXT: addl $128, %esp ; X86-SHLD-NEXT: popl %esi ; X86-SHLD-NEXT: popl %edi @@ -2557,12 +2575,12 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6 ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, {{[0-9]+}}(%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, (%esp) ; X86-HAVE-BMI2-NO-SHLD-NEXT: leal (,%ecx,8), %edx -; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $24, %edx ; X86-HAVE-BMI2-NO-SHLD-NEXT: andl $60, %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, 
(%esp,%ecx), %esi ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 4(%esp,%ecx), %edi ; X86-HAVE-BMI2-NO-SHLD-NEXT: shrxl %edx, %edi, %ebx ; X86-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $dl killed $dl killed $edx def $edx +; X86-HAVE-BMI2-NO-SHLD-NEXT: andb $24, %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: notb %dl ; X86-HAVE-BMI2-NO-SHLD-NEXT: movl 8(%esp,%ecx), %ecx ; X86-HAVE-BMI2-NO-SHLD-NEXT: addl %edi, %edi @@ -2607,27 +2625,26 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax -; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %edi -; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %edi ; X64-NO-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %r8 -; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %rdi +; X64-NO-BMI2-NO-SHLD-NEXT: andb $56, %cl ; X64-NO-BMI2-NO-SHLD-NEXT: notb %cl -; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r9,%r9), %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %r10 -; X64-NO-BMI2-NO-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r9 -; X64-NO-BMI2-NO-SHLD-NEXT: notl %eax +; X64-NO-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: orq %rdi, %r9 +; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-NO-SHLD-NEXT: shrq %cl, %r8 +; X64-NO-BMI2-NO-SHLD-NEXT: notb %al ; X64-NO-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-NO-SHLD-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-NO-SHLD-NEXT: orq %r9, %rsi +; X64-NO-BMI2-NO-SHLD-NEXT: orq %r8, %rsi ; X64-NO-BMI2-NO-SHLD-NEXT: movq %rsi, 8(%rdx) -; X64-NO-BMI2-NO-SHLD-NEXT: movq %r10, (%rdx) +; X64-NO-BMI2-NO-SHLD-NEXT: movq %r9, (%rdx) ; X64-NO-BMI2-NO-SHLD-NEXT: popq %rax ; X64-NO-BMI2-NO-SHLD-NEXT: retq ; @@ -2646,23 +2663,20 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-NO-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %eax -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %edi -; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %edi ; X64-NO-BMI2-HAVE-SHLD-NEXT: andl $56, %esi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r9 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r9, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r10 -; X64-NO-BMI2-HAVE-SHLD-NEXT: notl %eax +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrq %cl, %r9 +; X64-NO-BMI2-HAVE-SHLD-NEXT: notb %cl ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi ; X64-NO-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx ; X64-NO-BMI2-HAVE-SHLD-NEXT: shlq %cl, %rsi -; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r10, %rsi -; 
X64-NO-BMI2-HAVE-SHLD-NEXT: movl %edi, %ecx -; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r9, %r8 -; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %r8, (%rdx) +; X64-NO-BMI2-HAVE-SHLD-NEXT: orq %r9, %rsi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movl %eax, %ecx +; X64-NO-BMI2-HAVE-SHLD-NEXT: shrdq %cl, %r8, %rdi +; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rdi, (%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: movq %rsi, 8(%rdx) ; X64-NO-BMI2-HAVE-SHLD-NEXT: popq %rax ; X64-NO-BMI2-HAVE-SHLD-NEXT: retq @@ -2682,24 +2696,23 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-NO-SHLD-NEXT: leal (,%rsi,8), %eax -; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %ecx -; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %ecx ; X64-HAVE-BMI2-NO-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, -128(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, -128(%rsp,%rsi), %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: movl %eax, %edi +; X64-HAVE-BMI2-NO-SHLD-NEXT: andb $56, %dil +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %dil ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -120(%rsp,%rsi), %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rcx, %r8, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: # kill: def $cl killed $cl killed $rcx def $rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %cl ; X64-HAVE-BMI2-NO-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %r8, %r8 -; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rcx, %r8, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rdi, %rcx -; X64-HAVE-BMI2-NO-SHLD-NEXT: notl %eax +; X64-HAVE-BMI2-NO-SHLD-NEXT: leaq (%r8,%r8), %r9 +; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rdi, %r9, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rdi +; X64-HAVE-BMI2-NO-SHLD-NEXT: shrxq %rax, %r8, %rcx +; X64-HAVE-BMI2-NO-SHLD-NEXT: notb %al ; X64-HAVE-BMI2-NO-SHLD-NEXT: addq %rsi, %rsi ; X64-HAVE-BMI2-NO-SHLD-NEXT: shlxq %rax, %rsi, %rax -; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rax, %r9 -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %r9, 8(%rdx) -; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rcx, (%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: orq %rcx, %rax +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rax, 8(%rdx) +; X64-HAVE-BMI2-NO-SHLD-NEXT: movq %rdi, (%rdx) ; X64-HAVE-BMI2-NO-SHLD-NEXT: popq %rax ; X64-HAVE-BMI2-NO-SHLD-NEXT: retq ; @@ -2718,21 +2731,20 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: leal (,%rsi,8), %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %eax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notl %eax ; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %esi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rdi, %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shlxq %rax, %rdi, %rax -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: andl $56, %ecx -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rdi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %rsi -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rsi, %r8 -; X64-HAVE-BMI2-HAVE-SHLD-NEXT: orq %rax, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -128(%rsp,%rsi), %rax +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -120(%rsp,%rsi), %rdi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: shrxq %rcx, %rdi, %r8 +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movl %ecx, %r9d +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: notb %r9b +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: movq -112(%rsp,%rsi), %rsi +; X64-HAVE-BMI2-HAVE-SHLD-NEXT: addq %rsi, %rsi 
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
@@ -2743,10 +2755,10 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2756,37 +2768,38 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%esi,8), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
@@ -2796,8 +2809,7 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
@@ -2814,10 +2826,10 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
 ; X86-SHLD-NEXT:    subl $156, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movups (%eax), %xmm0
-; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2827,27 +2839,25 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edi
-; X86-SHLD-NEXT:    andl $60, %edi
-; X86-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-SHLD-NEXT:    movl 16(%esp,%edi), %eax
-; X86-SHLD-NEXT:    movl 20(%esp,%edi), %ebx
+; X86-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl 24(%esp,%eax), %edi
+; X86-SHLD-NEXT:    movl 16(%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 20(%esp,%eax), %ebx
 ; X86-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    andl $24, %ecx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-SHLD-NEXT:    movl 28(%esp,%edi), %ebp
-; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-SHLD-NEXT:    movl 32(%esp,%edi), %edi
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
-; X86-SHLD-NEXT:    movl %esi, 8(%edx)
-; X86-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    movl 28(%esp,%eax), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X86-SHLD-NEXT:    movl 32(%esp,%eax), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-SHLD-NEXT:    movl %ebp, 12(%esi)
+; X86-SHLD-NEXT:    movl %edi, 8(%esi)
+; X86-SHLD-NEXT:    movl %ebx, 4(%esi)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-SHLD-NEXT:    movl %eax, (%edx)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl %edx, (%esi)
 ; X86-SHLD-NEXT:    addl $156, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
@@ -2876,13 +2886,13 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
@@ -2943,43 +2953,42 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $56, %dil
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
@@ -3004,17 +3013,15 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %dil
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
@@ -3058,36 +3065,35 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r10, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $56, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %rdi, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
@@ -3108,32 +3114,30 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r11, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r9, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rbx, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
@@ -3144,10 +3148,10 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -3157,80 +3161,83 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%esi,8), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3255,10 +3262,10 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
 ; X86-SHLD-NEXT:    subl $156, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movups (%eax), %xmm0
-; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -3268,46 +3275,44 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edi
-; X86-SHLD-NEXT:    andl $60, %edi
-; X86-SHLD-NEXT:    movl 24(%esp,%edi), %edx
-; X86-SHLD-NEXT:    movl 20(%esp,%edi), %esi
-; X86-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    andl $24, %ecx
-; X86-SHLD-NEXT:    movl %edx, %eax
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 28(%esp,%edi), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X86-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X86-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 36(%esp,%edi), %esi
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X86-SHLD-NEXT:    movl 40(%esp,%edi), %edx
+; X86-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl 24(%esp,%eax), %esi
+; X86-SHLD-NEXT:    movl 20(%esp,%eax), %edi
+; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl %esi, %edx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 28(%esp,%eax), %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 32(%esp,%eax), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 36(%esp,%eax), %edi
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X86-SHLD-NEXT:    movl 40(%esp,%eax), %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl 44(%esp,%eax), %edx
 ; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl 44(%esp,%edi), %eax
+; X86-SHLD-NEXT:    movl 16(%esp,%eax), %ebx
+; X86-SHLD-NEXT:    movl 48(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-SHLD-NEXT:    movl 48(%esp,%edi), %edi
-; X86-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SHLD-NEXT:    movl %eax, 28(%edi)
-; X86-SHLD-NEXT:    movl %edx, 24(%edi)
-; X86-SHLD-NEXT:    movl %esi, 20(%edi)
-; X86-SHLD-NEXT:    movl %ebp, 16(%edi)
-; X86-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 12(%edi)
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 8(%edi)
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edx, 28(%eax)
+; X86-SHLD-NEXT:    movl %esi, 24(%eax)
+; X86-SHLD-NEXT:    movl %edi, 20(%eax)
+; X86-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SHLD-NEXT:    movl %edx, 4(%eax)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-SHLD-NEXT:    movl %ebx, (%edi)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movl %ebx, (%eax)
 ; X86-SHLD-NEXT:    addl $156, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
@@ -3336,23 +3341,23 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index 8d36eef952a2b..17cab42d476fd 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -605,8 +605,9 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -618,18 +619,19 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orb %dl, %bl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
@@ -673,12 +675,12 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orb %dl, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
@@ -1224,19 +1226,26 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movl %ecx, %eax
-; X64-NO-BMI2-NEXT:    shrb $6, %al
-; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NEXT:    movzbl %cl, %esi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rsi,8), %edi
+; X64-NO-BMI2-NEXT:    addl %edi, %edi
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orb %sil, %dil
+; X64-NO-BMI2-NEXT:    movb %dil, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
@@ -1252,14 +1261,22 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orb %al, %cl
+; X64-BMI2-NEXT:    movb %cl, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $68, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1273,18 +1290,19 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orb %dl, %bl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $68, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
@@ -1332,12 +1350,12 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orb %dl, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
diff --git a/llvm/test/CodeGen/X86/xor-icmp.ll b/llvm/test/CodeGen/X86/xor-icmp.ll
index 16a3b6cb855a7..59db27ef5e6dd 100644
--- a/llvm/test/CodeGen/X86/xor-icmp.ll
+++ b/llvm/test/CodeGen/X86/xor-icmp.ll
@@ -15,11 +15,16 @@ define i32 @t(i32 %a, i32 %b) nounwind ssp {
 ;
 ; X64-LABEL: t:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorl %esi, %edi
+; X64-NEXT:    shrl $14, %edi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    btl $14, %esi
+; X64-NEXT:    sbbb $0, %dil
+; X64-NEXT:    je .LBB0_1
+; X64-NEXT:  # %bb.2: # %bb1
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    jmp bar # TAILCALL
+; X64-NEXT:  .LBB0_1: # %bb
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    testl $16384, %edi # imm = 0x4000
-; X64-NEXT:    jne bar # TAILCALL
-; X64-NEXT:  # %bb.1: # %bb
 ; X64-NEXT:    jmp foo # TAILCALL
 entry:
   %0 = and i32 %a, 16384
@@ -96,7 +101,7 @@ define i1 @xor_not_bools(i1 zeroext %x, i1 zeroext %y) nounwind {
 ; X64-LABEL: xor_not_bools:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    xorb %sil, %al
 ; X64-NEXT:    xorb $1, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/xor-with-overflow.ll b/llvm/test/CodeGen/X86/xor-with-overflow.ll
index 5d22302d39add..e025989156e9e 100644
--- a/llvm/test/CodeGen/X86/xor-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/xor-with-overflow.ll
@@ -46,8 +46,8 @@ define i8 @xor_i8_rr(i8 zeroext %0, i8 zeroext %1) {
 ;
 ; X64-LABEL: xor_i8_rr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    xorl %edi, %eax
+; X64-NEXT:    xorb %dil, %sil
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index 93203ef6e17f5..1775733732c9e 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -312,22 +312,18 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X64-NEXT:    .p2align 4
 ; X64-NEXT:  .LBB3_1: # %for.body
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movzbl -3(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -3(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -3(%rdx,%rax)
-; X64-NEXT:    movzbl -2(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -2(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -2(%rdx,%rax)
-; X64-NEXT:    movzbl -1(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -1(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -1(%rdx,%rax)
-; X64-NEXT:    movzbl (%rdi,%rax), %ecx
-; X64-NEXT:    movzbl (%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, (%rdx,%rax)
+; X64-NEXT:    movzbl -3(%rsi,%rax), %ecx
+; X64-NEXT:    addb -3(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -3(%rdx,%rax)
+; X64-NEXT:    movzbl -2(%rsi,%rax), %ecx
+; X64-NEXT:    addb -2(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -2(%rdx,%rax)
+; X64-NEXT:    movzbl -1(%rsi,%rax), %ecx
+; X64-NEXT:    addb -1(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -1(%rdx,%rax)
+; X64-NEXT:    movzbl (%rsi,%rax), %ecx
+; X64-NEXT:    addb (%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, (%rdx,%rax)
 ; X64-NEXT:    addq $4, %rax
 ; X64-NEXT:    cmpl $403, %eax # imm = 0x193
 ; X64-NEXT:    jne .LBB3_1
@@ -337,7 +333,6 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X32-LABEL: foldedidx:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    movl $3, %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -346,28 +341,23 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X32-NEXT:    .p2align 4
 ; X32-NEXT:  .LBB3_1: # %for.body
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movzbl -3(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -3(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -3(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -3(%ecx,%eax)
-; X32-NEXT:    movzbl -2(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -2(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -2(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -2(%ecx,%eax)
-; X32-NEXT:    movzbl -1(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -1(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -1(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -1(%ecx,%eax)
-; X32-NEXT:    movzbl (%esi,%eax), %edi
 ; X32-NEXT:    movzbl (%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb (%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, (%ecx,%eax)
 ; X32-NEXT:    addl $4, %eax
 ; X32-NEXT:    cmpl $403, %eax # imm = 0x193
 ; X32-NEXT:    jne .LBB3_1
 ; X32-NEXT:  # %bb.2: # %for.end
 ; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 entry: