diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 9d6584ad3faa0..26cb7cbbc5d72 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -3225,29 +3225,51 @@ bool AMDGPUDAGToDAGISel::SelectVOP3ModsImpl(SDValue In, SDValue &Src,
   if (IsCanonicalizing)
     return true;
 
-  unsigned Opc = Src->getOpcode();
+  // v2i32 xor/or/and are legal. A vselect using these instructions as operands
+  // is scalarised into two selects with EXTRACT_VECTOR_ELT operands. Peek
+  // through the extract to the bitwise op.
+  SDValue PeekSrc =
+      Src->getOpcode() == ISD::EXTRACT_VECTOR_ELT ? Src->getOperand(0) : Src;
+  // Convert various sign-bit masks to src mods. Currently disabled for 16-bit
+  // types as the codegen replaces the operand without adding a srcmod.
+  // This is intentionally finding the cases where we are performing float neg
+  // and abs on int types; the goal is not to obtain two's complement neg or
+  // abs.
+  // TODO: Add 16-bit support.
+  unsigned Opc = PeekSrc.getOpcode();
   EVT VT = Src.getValueType();
   if ((Opc != ISD::AND && Opc != ISD::OR && Opc != ISD::XOR) ||
-      (VT != MVT::i32 && VT != MVT::i64))
+      (VT != MVT::i32 && VT != MVT::v2i32 && VT != MVT::i64))
     return true;
 
-  ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Src->getOperand(1));
+  ConstantSDNode *CRHS = isConstOrConstSplat(PeekSrc ? PeekSrc->getOperand(1)
+                                                     : Src->getOperand(1));
   if (!CRHS)
     return true;
 
+  auto ReplaceSrc = [&]() -> SDValue {
+    if (Src->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+      return PeekSrc.getOperand(0);
+
+    SDValue LHS = PeekSrc->getOperand(0);
+    SDValue Index = Src->getOperand(1);
+    return CurDAG->getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(Src),
+                           Src.getValueType(), LHS, Index);
+  };
+
   // Recognise (xor a, 0x80000000) as NEG SrcMod.
   // Recognise (and a, 0x7fffffff) as ABS SrcMod.
   // Recognise (or a, 0x80000000) as NEG+ABS SrcModifiers.
   if (Opc == ISD::XOR && CRHS->getAPIntValue().isSignMask()) {
     Mods |= SISrcMods::NEG;
-    Src = Src.getOperand(0);
+    Src = ReplaceSrc();
   } else if (Opc == ISD::AND && AllowAbs &&
              CRHS->getAPIntValue().isMaxSignedValue()) {
     Mods |= SISrcMods::ABS;
-    Src = Src.getOperand(0);
+    Src = ReplaceSrc();
  } else if (Opc == ISD::OR && AllowAbs &&
             CRHS->getAPIntValue().isSignMask()) {
     Mods |= SISrcMods::ABS | SISrcMods::NEG;
-    Src = Src.getOperand(0);
+    Src = ReplaceSrc();
   }
 
   return true;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 64e68ab7d753c..b80ce37d2d3c1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -4048,6 +4048,59 @@ SDValue AMDGPUTargetLowering::splitBinaryBitConstantOpImpl(
   return DAG.getNode(ISD::BITCAST, SL, MVT::i64, Vec);
 }
 
+// Part of the shift combines is to optimise for the case where it is possible
+// to reduce e.g. shl64 to shl32 when the shift amount lies in [32, 63]. This
+// transforms: DST = shl i64 X, Y to [0, shl i32 X, (Y & 31)]. The
+// '&' is then elided by ISel. The vector code for this was being
+// completely scalarised by the vector legalizer, but when v2i32 is
+// legal the vector legalizer only partially scalarises the
+// vector operations and the AND is not elided. This function
+// scalarises the AND for this optimisation case.
+static SDValue getShiftForReduction(unsigned ShiftOpc, SDValue LHS, SDValue RHS,
+                                    SelectionDAG &DAG) {
+  assert(
+      (ShiftOpc == ISD::SRA || ShiftOpc == ISD::SRL || ShiftOpc == ISD::SHL) &&
+      "Expected shift Opcode.");
+
+  SDLoc SL = SDLoc(RHS);
+  if (RHS->getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+    return SDValue();
+
+  SDValue VAND = RHS.getOperand(0);
+  if (VAND->getOpcode() != ISD::AND)
+    return SDValue();
+
+  ConstantSDNode *CRRHS = dyn_cast<ConstantSDNode>(RHS->getOperand(1));
+  if (!CRRHS)
+    return SDValue();
+
+  SDValue LHSAND = VAND.getOperand(0);
+  SDValue RHSAND = VAND.getOperand(1);
+  if (RHSAND->getOpcode() != ISD::BUILD_VECTOR)
+    return SDValue();
+
+  ConstantSDNode *CANDL = dyn_cast<ConstantSDNode>(RHSAND->getOperand(0));
+  ConstantSDNode *CANDR = dyn_cast<ConstantSDNode>(RHSAND->getOperand(1));
+  if (!CANDL || !CANDR || RHSAND->getConstantOperandVal(0) != 0x1f ||
+      RHSAND->getConstantOperandVal(1) != 0x1f)
+    return SDValue();
+  // Get the non-const AND operands and produce a scalar AND.
+  const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
+  const SDValue One = DAG.getConstant(1, SL, MVT::i32);
+  SDValue Lo = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, Zero);
+  SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, LHSAND, One);
+  SDValue AndMask = DAG.getConstant(0x1f, SL, MVT::i32);
+  SDValue LoAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Lo, AndMask);
+  SDValue HiAnd = DAG.getNode(ISD::AND, SL, MVT::i32, Hi, AndMask);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, LHS);
+  uint64_t AndIndex = RHS->getConstantOperandVal(1);
+  if (AndIndex == 0 || AndIndex == 1)
+    return DAG.getNode(ShiftOpc, SL, MVT::i32, Trunc,
+                       AndIndex == 0 ? LoAnd : HiAnd, RHS->getFlags());
+
+  return SDValue();
+}
+
 SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
                                                 DAGCombinerInfo &DCI) const {
   EVT VT = N->getValueType(0);
@@ -4057,6 +4110,9 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   SDLoc SL(N);
   SelectionDAG &DAG = DCI.DAG;
 
+  if (SDValue SS = getShiftForReduction(ISD::SHL, LHS, RHS, DAG))
+    return SS;
+
   unsigned RHSVal;
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
@@ -4098,8 +4154,6 @@ SDValue AMDGPUTargetLowering::performShlCombine(SDNode *N,
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
-  // i64 (shl x, C) -> (build_pair 0, (shl x, C - 32))
-
   // On some subtargets, 64-bit shift is a quarter rate instruction. In the
   // common case, splitting this into a move and a 32-bit shift is faster and
   // the same code size.
@@ -4159,6 +4213,9 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
   SelectionDAG &DAG = DCI.DAG;
   SDLoc SL(N);
 
+  if (SDValue SS = getShiftForReduction(ISD::SRA, LHS, RHS, DAG))
+    return SS;
+
   if (VT.getScalarType() != MVT::i64)
     return SDValue();
 
@@ -4189,12 +4246,12 @@ SDValue AMDGPUTargetLowering::performSraCombine(SDNode *N,
       (ElementType.getSizeInBits() - 1)) {
     ShiftAmt = ShiftFullAmt;
   } else {
-    SDValue truncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
+    SDValue TruncShiftAmt = DAG.getNode(ISD::TRUNCATE, SL, TargetType, RHS);
     const SDValue ShiftMask =
         DAG.getConstant(TargetScalarType.getSizeInBits() - 1, SL, TargetType);
     // This AND instruction will clamp out of bounds shift values.
     // It will also be removed during later instruction selection.
-    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, truncShiftAmt, ShiftMask);
+    ShiftAmt = DAG.getNode(ISD::AND, SL, TargetType, TruncShiftAmt, ShiftMask);
   }
 
   EVT ConcatType;
@@ -4261,6 +4318,9 @@ SDValue AMDGPUTargetLowering::performSrlCombine(SDNode *N,
   SDLoc SL(N);
   unsigned RHSVal;
 
+  if (SDValue SS = getShiftForReduction(ISD::SRL, LHS, RHS, DAG))
+    return SS;
+
   if (CRHS) {
     RHSVal = CRHS->getZExtValue();
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index 8f44c03d95b43..7efdef93c1237 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -440,6 +440,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::VECTOR_SHUFFLE, {MVT::v2i32, MVT::v2f32}, Legal);
   }
 
+  setOperationAction({ISD::AND, ISD::OR, ISD::XOR}, MVT::v2i32, Legal);
+  // Prevent SELECT v2i32 from being implemented with the above bitwise ops and
+  // instead lower to cndmask in SITargetLowering::LowerSELECT().
+  setOperationAction(ISD::SELECT, MVT::v2i32, Custom);
+  // Enable MatchRotate to produce ISD::ROTR, which is later transformed to
+  // alignbit.
+  setOperationAction(ISD::ROTR, MVT::v2i32, Custom);
+
   setOperationAction(ISD::BUILD_VECTOR, {MVT::v4f16, MVT::v4i16, MVT::v4bf16},
                      Custom);
 
@@ -6183,6 +6191,20 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
   return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op), VT, OpLo, OpHi);
 }
 
+// Enable lowering of ROTR for vNi32 types. This is a workaround for a
+// regression whereby extra unnecessary instructions were added to codegen
+// for rotr operations, caused by legalising v2i32 OR. This resulted in extra
+// instructions to extract the result from the vector.
+SDValue SITargetLowering::lowerROTR(SDValue Op, SelectionDAG &DAG) const {
+  [[maybe_unused]] EVT VT = Op.getValueType();
+
+  assert((VT == MVT::v2i32 || VT == MVT::v4i32 || VT == MVT::v8i32 ||
+          VT == MVT::v16i32) &&
+         "Unexpected ValueType.");
+
+  return DAG.UnrollVectorOp(Op.getNode());
+}
+
 // Work around LegalizeDAG doing the wrong thing and fully scalarizing if the
 // wider vector type is legal.
 SDValue SITargetLowering::splitBinaryVectorOp(SDValue Op,
@@ -6374,6 +6396,8 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
     return lowerGET_FPENV(Op, DAG);
   case ISD::SET_FPENV:
     return lowerSET_FPENV(Op, DAG);
+  case ISD::ROTR:
+    return lowerROTR(Op, DAG);
   }
   return SDValue();
 }
@@ -13412,6 +13436,47 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
     }
   }
 
+  // Detect identity v2i32 OR and replace with the identity source node.
+  // Specifically an Or that has operands constructed from the same source node
+  // via extract_vector_elt and build_vector, i.e.
+  //   v2i32 or(
+  //     v2i32 build_vector(
+  //       i32 extract_elt(%IdentitySrc, 0),
+  //       i32 0
+  //     ),
+  //     v2i32 build_vector(
+  //       i32 0,
+  //       i32 extract_elt(%IdentitySrc, 1)
+  //     ) )
+  //   =>
+  //   v2i32 %IdentitySrc
+
+  if (VT == MVT::v2i32 && LHS->getOpcode() == ISD::BUILD_VECTOR &&
+      RHS->getOpcode() == ISD::BUILD_VECTOR) {
+
+    ConstantSDNode *LC = dyn_cast<ConstantSDNode>(LHS->getOperand(1));
+    ConstantSDNode *RC = dyn_cast<ConstantSDNode>(RHS->getOperand(0));
+
+    // Test for and normalise build vectors.
+    if (LC && RC && LC->getZExtValue() == 0 && RC->getZExtValue() == 0) {
+
+      // Get the extract_vector_elt operands.
+      SDValue LEVE = LHS->getOperand(0);
+      SDValue REVE = RHS->getOperand(1);
+
+      if (LEVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+          REVE->getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+        // Check that different elements from the same vector are
+        // extracted.
+        if (LEVE->getOperand(0) == REVE->getOperand(0) &&
+            LEVE->getOperand(1) != REVE->getOperand(1)) {
+          SDValue IdentitySrc = LEVE.getOperand(0);
+          return IdentitySrc;
+        }
+      }
+    }
+  }
+
   if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
     return SDValue();
 
@@ -13459,7 +13524,7 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
   SDValue LHS = N->getOperand(0);
   SDValue RHS = N->getOperand(1);
 
-  const ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(RHS);
+  const ConstantSDNode *CRHS = isConstOrConstSplat(RHS);
   SelectionDAG &DAG = DCI.DAG;
 
   EVT VT = N->getValueType(0);
@@ -13469,6 +13534,23 @@ SDValue SITargetLowering::performXorCombine(SDNode *N,
       return Split;
   }
 
+  // v2i32 (xor (vselect cc, x, y), K) ->
+  //   (v2i32 vselect cc, (xor x, K), (xor y, K)). This enables the xor to be
+  //   replaced with source modifiers when the select is lowered to CNDMASK.
+  unsigned Opc = LHS.getOpcode();
+  if (((Opc == ISD::VSELECT && VT == MVT::v2i32) ||
+       (Opc == ISD::SELECT && VT == MVT::i64)) &&
+      CRHS && CRHS->getAPIntValue().isSignMask()) {
+    SDValue CC = LHS->getOperand(0);
+    SDValue TRUE = LHS->getOperand(1);
+    SDValue FALSE = LHS->getOperand(2);
+    SDValue XTrue = DAG.getNode(ISD::XOR, SDLoc(N), VT, TRUE, RHS);
+    SDValue XFalse = DAG.getNode(ISD::XOR, SDLoc(N), VT, FALSE, RHS);
+    SDValue XSelect =
+        DAG.getNode(ISD::VSELECT, SDLoc(N), VT, CC, XTrue, XFalse);
+    return XSelect;
+  }
+
   // Make sure to apply the 64-bit constant splitting fold before trying to fold
   // fneg-like xors into 64-bit select.
   if (LHS.getOpcode() == ISD::SELECT && VT == MVT::i32) {
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index dedd9ae170774..97955da4657e9 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -444,6 +444,7 @@ class SITargetLowering final : public AMDGPUTargetLowering {
   SDValue lowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerGET_FPENV(SDValue Op, SelectionDAG &DAG) const;
   SDValue lowerSET_FPENV(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerROTR(SDValue Op, SelectionDAG &DAG) const;
 
   Register getRegisterByName(const char* RegName, LLT VT,
                              const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index bd5dfa92a8e43..aaf5e1ab1431a 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -2467,9 +2467,9 @@ def : AMDGPUPatIgnoreCopies <
                  (COPY_TO_REGCLASS VSrc_b32:$z, VGPR_32))
 >;
 
-// 64-bit version
+foreach vt = [i64, v2i32] in {
 def : AMDGPUPatIgnoreCopies <
-  (DivergentBinFrag<xor> i64:$z, (and i64:$x, (xor i64:$y, i64:$z))),
+  (DivergentBinFrag<xor> vt:$z, (and vt:$x, (xor vt:$y, vt:$z))),
   (REG_SEQUENCE VReg_64,
     (V_BFI_B32_e64 (i32 (EXTRACT_SUBREG VReg_64:$x, sub0)),
               (i32 (EXTRACT_SUBREG VReg_64:$y, sub0)),
@@ -2478,6 +2478,7 @@ def : AMDGPUPatIgnoreCopies <
               (i32 (EXTRACT_SUBREG VReg_64:$y, sub1)),
               (i32 (EXTRACT_SUBREG VReg_64:$z, sub1))), sub1)
 >;
+}
 
 def : AMDGPUPat <
   (fcopysign f32:$src0, f32:$src1),
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 431d73b9a95b5..592c48a2acec8 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1841,6 +1841,21 @@ def : GCNPat <
   (S_MOV_B32 imm:$imm)
 >;
 
+def : GCNPat <
+  (v2i32 (UniformBinFrag<and> v2i32:$x, v2i32:$y)),
+  (S_AND_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+  (v2i32 (UniformBinFrag<or> v2i32:$x, v2i32:$y)),
+  (S_OR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
+def : GCNPat <
+  (v2i32 (UniformBinFrag<xor> v2i32:$x, v2i32:$y)),
+  (S_XOR_B64 SReg_64:$x, SReg_64:$y)
+>;
+
 // Same as a 32-bit inreg
 def : GCNPat<
   (i32 (UniformUnaryFrag i16:$src)),
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index 9de7d6d009fe1..a62125c37bf8b 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1026,9 +1026,9 @@ def : DivergentClampingBinOp;
 def : DivergentBinOp;
 def : DivergentBinOp;
 
-class divergent_i64_BinOp :
+class divergent_i64_BinOp :
   GCNPat<
-      (DivergentBinFrag i64:$src0, i64:$src1),
+      (DivergentBinFrag vt:$src0, vt:$src1),
       (REG_SEQUENCE VReg_64,
         (Inst
           (i32 (EXTRACT_SUBREG $src0, sub0)),
@@ -1045,6 +1045,10 @@ def : divergent_i64_BinOp ;
 def : divergent_i64_BinOp ;
 def : divergent_i64_BinOp ;
 
+def : divergent_i64_BinOp ;
+def : divergent_i64_BinOp ;
+def : divergent_i64_BinOp ;
+
 // mul24 w/ 64 bit output.
 class mul24_64_Pat : GCNPat<
   (i64 (Op i32:$src0, i32:$src1)),
diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index ca1e7c9b06653..29bfc253e2e7e 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -1,10 +1,42 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
-; RUN: llc -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s
-; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
-; RUN: llc -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s
+; RUN: llc -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX6 %s
+; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GFX8 %s
+
 declare i32 @llvm.amdgcn.workitem.id.x() #0
 
+define amdgpu_ps <2 x i32> @s_and_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) {
+; GFX6-LABEL: s_and_v2i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX6-NEXT: ; return to shader part epilog
+;
+; GFX8-LABEL: s_and_v2i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3]
+; GFX8-NEXT: ; return to shader part epilog
+  %result = and <2 x i32> %num, %den
+  ret <2 x i32> %result
+}
+
+define <2 x i32> @v_and_v2i32(<2 x i32> %num, <2 x i32> %den) {
+; GFX6-LABEL: v_and_v2i32:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX6-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: v_and_v2i32:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_and_b32_e32 v1, v1, v3
+; GFX8-NEXT: v_and_b32_e32 v0, v0, v2
+; GFX8-NEXT: s_setpc_b64 s[30:31]
+  %result = and <2 x i32> %num, %den
+  ret <2 x i32> %result
+}
+
 define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX6-LABEL: test2:
 ; GFX6: ; %bb.0:
@@ -14,8 +46,7 @@ define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) {
 ; GFX6-NEXT: s_mov_b32 s3, 0xf000
 ; GFX6-NEXT: s_mov_b32 s2, -1
 ; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_and_b32 s5, s5, s7
-; GFX6-NEXT:
s_and_b32 s4, s4, s6 +; GFX6-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 @@ -29,30 +60,11 @@ define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX8-NEXT: s_mov_b32 s3, 0xf000 ; GFX8-NEXT: s_mov_b32 s2, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: s_and_b32 s5, s5, s7 -; GFX8-NEXT: s_and_b32 s4, s4, s6 +; GFX8-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: test2: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: AND_INT * T0.Y, T0.Y, T0.W, -; EG-NEXT: AND_INT T0.X, T0.X, T0.Z, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -99,27 +111,6 @@ define amdgpu_kernel void @test4(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: test4: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT * T0.W, T0.W, T1.W, -; EG-NEXT: AND_INT * T0.Z, T0.Z, T1.Z, -; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y, -; EG-NEXT: AND_INT T0.X, T0.X, T1.X, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -154,17 +145,6 @@ define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT * T1.X, KC0[2].Z, KC0[2].W, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i32 %a, %b store i32 %and, ptr addrspace(1) %out, align 4 ret void @@ -194,17 +174,6 @@ define amdgpu_kernel void @s_and_constant_i32(ptr addrspace(1) %out, i32 %a) { ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_constant_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU 
clause starting at 4: -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT * T1.X, KC0[2].Z, literal.y, -; EG-NEXT: 2(2.802597e-45), 1234567(1.729997e-39) %and = and i32 %a, 1234567 store i32 %and, ptr addrspace(1) %out, align 4 ret void @@ -252,20 +221,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_multi_use_constant_i32_0: -; EG: ; %bb.0: -; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: MOV T0.X, literal.x, -; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, -; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00) -; EG-NEXT: ADD_INT T1.X, PV.W, KC0[2].W, -; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i32 %a, 1234567 ; Just to stop future replacement of copy to vgpr + store with VALU op. @@ -309,20 +264,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_multi_use_constant_i32_1: -; EG: ; %bb.0: -; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, -; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.W, PV.W, KC0[2].W, -; EG-NEXT: ADD_INT T0.X, PV.W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) %and = and i32 %a, 1234567 %foo = add i32 %and, 1234567 %bar = add i32 %foo, %b @@ -371,28 +312,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_vgpr(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_and_b32_e32 v2, v5, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_i32_vgpr_vgpr: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: AND_INT T0.X, T0.X, T1.X, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T1.X, PV.W, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid @@ -440,26 +359,6 @@ define amdgpu_kernel void @v_and_i32_sgpr_vgpr(ptr addrspace(1) %out, i32 %a, pt ; GFX8-NEXT: v_and_b32_e32 v2, s2, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_i32_sgpr_vgpr: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause 
starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, KC0[2].Z, T0.X, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T1.X, PV.W, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -504,26 +403,6 @@ define amdgpu_kernel void @v_and_i32_vgpr_sgpr(ptr addrspace(1) %out, ptr addrsp ; GFX8-NEXT: v_and_b32_e32 v2, s4, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_i32_vgpr_sgpr: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, KC0[2].W, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, -; EG-NEXT: LSHR * T1.X, PV.W, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid @@ -568,25 +447,6 @@ define amdgpu_kernel void @v_and_constant_i32(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_constant_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %a = load i32, ptr addrspace(1) %gep, align 4 @@ -630,25 +490,6 @@ define amdgpu_kernel void @v_and_inline_imm_64_i32(ptr addrspace(1) %out, ptr ad ; GFX8-NEXT: v_and_b32_e32 v0, 64, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_inline_imm_64_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, 
-; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %a = load i32, ptr addrspace(1) %gep, align 4 @@ -692,25 +533,6 @@ define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(ptr addrspace(1) %out, pt ; GFX8-NEXT: v_and_b32_e32 v0, -16, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_inline_imm_neg_16_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -16(nan), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid %a = load i32, ptr addrspace(1) %gep, align 4 @@ -749,18 +571,6 @@ define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, KC0[3].Z, -; EG-NEXT: AND_INT * T0.X, KC0[2].W, KC0[3].Y, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, %b store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -794,34 +604,6 @@ define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) { ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_i1: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @10, KC0[], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3 -; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, 0.0, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT * T1.W, T1.X, T0.X, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T1.W, PS, 1, -; EG-NEXT: LSHL * T0.W, PV.W, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T0.X, PV.W, PS, -; EG-NEXT: LSHL * T0.W, literal.x, PS, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: MOV * T0.Z, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i1 %a, %b store i1 %and, ptr addrspace(1) %out ret void @@ -857,19 +639,6 @@ define amdgpu_kernel void @s_and_constant_i64(ptr addrspace(1) %out, i64 %a) { ; GFX8-NEXT: v_mov_b32_e32 v1, s0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 524288(7.346840e-40), 2(2.802597e-45) %and = and i64 %a, 549756338176 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -921,27 +690,6 @@ define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_multi_use_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 10, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: ALU clause starting at 6: -; EG-NEXT: AND_INT T0.X, KC0[3].Y, literal.x, -; EG-NEXT: AND_INT * T1.X, KC0[3].Z, literal.y, -; EG-NEXT: 524288(7.346840e-40), 128(1.793662e-43) -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) -; EG-NEXT: LSHR T2.X, PV.W, literal.x, -; EG-NEXT: AND_INT * T3.X, KC0[3].X, literal.y, -; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) -; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, -; EG-NEXT: AND_INT * T5.X, KC0[2].W, literal.y, -; EG-NEXT: 2(2.802597e-45), 524288(7.346840e-40) %and0 = and i64 %a, 549756338176 %and1 = and i64 %b, 549756338176 store volatile i64 %and0, ptr addrspace(1) %out @@ -975,18 +723,6 @@ define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_32_bit_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) %and = and i64 %a, 1234567 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1046,34 +782,6 @@ define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_multi_use_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 17, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T5.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: ALU clause starting at 6: -; EG-NEXT: LSHL T0.W, KC0[3].W, 1, -; EG-NEXT: LSHL * T1.W, KC0[2].W, 1, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, PV.W, KC0[4].W, -; EG-NEXT: AND_INT T1.W, T1.W, literal.x, -; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W, -; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T1.X, KC0[5].X, PS, -; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W, -; EG-NEXT: ADD_INT T2.X, KC0[5].X, PV.W, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, -; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) -; EG-NEXT: LSHR T3.X, 
PV.W, literal.x, -; EG-NEXT: ADD_INT * T4.X, T1.W, KC0[4].W, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %shl.a = shl i64 %a, 1 %shl.b = shl i64 %b, 1 %and0 = and i64 %shl.a, 62 @@ -1130,28 +838,6 @@ define amdgpu_kernel void @v_and_i64(ptr addrspace(1) %out, ptr addrspace(1) %ap ; GFX8-NEXT: v_and_b32_e32 v0, v0, v2 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, -; EG-NEXT: ALU clause starting at 14: -; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y, -; EG-NEXT: AND_INT T0.X, T0.X, T1.X, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1199,27 +885,6 @@ define amdgpu_kernel void @v_and_constant_i64(ptr addrspace(1) %out, ptr addrspa ; GFX8-NEXT: v_and_b32_e32 v0, 0xab19b207, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT * T0.Y, T0.Y, literal.x, -; EG-NEXT: 286(4.007714e-43), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1280,47 +945,6 @@ define amdgpu_kernel void @v_and_multi_use_constant_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: buffer_store_dwordx2 v[2:3], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_multi_use_constant_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @12 -; EG-NEXT: ALU 0, @22, KC0[], KC1[] -; EG-NEXT: TEX 0 @14 -; EG-NEXT: ALU 0, @23, KC0[], KC1[] -; EG-NEXT: TEX 1 @16 -; EG-NEXT: ALU 10, @24, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T5.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T5.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1 -; EG-NEXT: Fetch clause starting at 14: -; EG-NEXT: VTX_READ_32 T2.X, T2.X, 0, #1 -; EG-NEXT: Fetch clause starting at 16: -; 
EG-NEXT: VTX_READ_32 T3.X, T3.X, 4, #1 -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 20: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: MOV * T1.X, PV.X, -; EG-NEXT: ALU clause starting at 22: -; EG-NEXT: MOV * T2.X, T0.X, -; EG-NEXT: ALU clause starting at 23: -; EG-NEXT: MOV * T3.X, T0.X, -; EG-NEXT: ALU clause starting at 24: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: AND_INT * T3.X, T3.X, literal.y, -; EG-NEXT: -1424379385(-5.460358e-13), 286(4.007714e-43) -; EG-NEXT: AND_INT T2.X, T2.X, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45) -; EG-NEXT: AND_INT T1.X, T1.X, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 286(4.007714e-43), 4(5.605194e-45) -; EG-NEXT: LSHR * T5.X, PV.W, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load volatile i64, ptr addrspace(1) %aptr %b = load volatile i64, ptr addrspace(1) %aptr %and0 = and i64 %a, 1231231234567 @@ -1382,45 +1006,6 @@ define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ; GFX8-NEXT: buffer_store_dwordx2 v[4:5], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_multi_use_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @12 -; EG-NEXT: ALU 0, @22, KC0[], KC1[] -; EG-NEXT: TEX 0 @14 -; EG-NEXT: ALU 0, @23, KC0[], KC1[] -; EG-NEXT: TEX 1 @16 -; EG-NEXT: ALU 8, @24, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: Fetch clause starting at 12: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1 -; EG-NEXT: Fetch clause starting at 14: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 -; EG-NEXT: Fetch clause starting at 16: -; EG-NEXT: VTX_READ_32 T2.X, T2.X, 4, #1 -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 20: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: MOV * T1.X, PV.X, -; EG-NEXT: ALU clause starting at 22: -; EG-NEXT: MOV * T1.X, T0.X, -; EG-NEXT: ALU clause starting at 23: -; EG-NEXT: MOV * T2.X, T0.X, -; EG-NEXT: ALU clause starting at 24: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: AND_INT * T1.X, T1.X, literal.x, -; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) -; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, -; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45) -; EG-NEXT: LSHR T3.X, PV.W, literal.x, -; EG-NEXT: MOV * T4.X, literal.y, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load volatile i64, ptr addrspace(1) %aptr %b = load volatile i64, ptr addrspace(1) %aptr %and0 = and i64 %a, 63 @@ -1466,26 +1051,6 @@ define amdgpu_kernel void @v_and_i64_32_bit_constant(ptr addrspace(1) %out, ptr ; GFX8-NEXT: v_and_b32_e32 v0, 0x12d687, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_i64_32_bit_constant: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, 
KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1530,26 +1095,6 @@ define amdgpu_kernel void @v_and_inline_imm_i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_and_b32_e32 v0, 64, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1595,25 +1140,6 @@ define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_and_b32_e32 v0, -8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: v_and_inline_neg_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: LSHL * T0.W, T0.X, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: AND_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -8(nan), 2(2.802597e-45) %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid %a = load i64, ptr addrspace(1) %gep.a, align 8 @@ -1648,18 +1174,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr ad ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_64_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) %and = and i64 %a, 64 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1699,22 +1213,6 @@ define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %ou ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_64_i64_noshrink: -; EG: ; %bb.0: -; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHL * T0.W, KC0[2].W, 1, -; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, -; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, PV.W, KC0[3].W, -; EG-NEXT: ADDC_UINT T0.W, PV.W, KC0[3].W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: ADD_INT * T0.Y, KC0[4].X, PV.W, %shl = shl i64 %a, 1 %and = and i64 %shl, 64 %add = add i64 %and, %b @@ -1748,18 +1246,6 @@ define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr add ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_1_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.X, KC0[2].W, 1, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 1 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1791,19 +1277,6 @@ define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_1.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1072693248(1.875000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4607182418800017408 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1835,19 +1308,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_neg_1.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1074790400(-1.875000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13830554455654793216 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1879,19 +1339,6 @@ define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_0.5_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1071644672(1.750000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4602678819172646912 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1923,19 +1370,6 @@ define amdgpu_kernel 
void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_neg_0.5_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1075838976(-1.750000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13826050856027422720 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -1967,19 +1401,6 @@ define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_2.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4611686018427387904 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2011,19 +1432,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_neg_2.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1073741824(-2.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13835058055282163712 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2055,19 +1463,6 @@ define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr a ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1074790400(2.250000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4616189618054758400 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2099,19 +1494,6 @@ define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_neg_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 
-1072693248(-2.250000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13839561654909534208 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2146,18 +1528,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, p ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_f32_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 1082130432(4.000000e+00), 2(2.802597e-45) %and = and i64 %a, 1082130432 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2189,18 +1559,6 @@ define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %ou ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: MOV * T0.Y, KC0[3].X, -; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -1065353216(-4.000000e+00), 2(2.802597e-45) %and = and i64 %a, -1065353216 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2234,19 +1592,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %o ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 4647714815446351872 store i64 %and, ptr addrspace(1) %out, align 8 ret void @@ -2278,19 +1623,6 @@ define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, -; EG-NEXT: -1065353216(-4.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV T0.X, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %and = and i64 %a, 13871086852301127680 store i64 %and, ptr addrspace(1) %out, align 8 ret void diff --git a/llvm/test/CodeGen/AMDGPU/and.r600.ll b/llvm/test/CodeGen/AMDGPU/and.r600.ll new file mode 100644 index 0000000000000..590b1ac899fcf --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/and.r600.ll @@ -0,0 +1,987 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: 
--version 5 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s + + +define amdgpu_kernel void @test2(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; EG-LABEL: test2: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: AND_INT * T0.Y, T0.Y, T0.W, +; EG-NEXT: AND_INT T0.X, T0.X, T0.Z, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <2 x i32>, ptr addrspace(1) %in + %b = load <2 x i32>, ptr addrspace(1) %b_ptr + %result = and <2 x i32> %a, %b + store <2 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @test4(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; EG-LABEL: test4: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT * T0.W, T0.W, T1.W, +; EG-NEXT: AND_INT * T0.Z, T0.Z, T1.Z, +; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y, +; EG-NEXT: AND_INT T0.X, T0.X, T1.X, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <4 x i32>, ptr addrspace(1) %in + %b = load <4 x i32>, ptr addrspace(1) %b_ptr + %result = and <4 x i32> %a, %b + store <4 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_and_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { +; EG-LABEL: s_and_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.X, KC0[2].Z, KC0[2].W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i32 %a, %b + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @s_and_constant_i32(ptr addrspace(1) %out, i32 %a) { +; EG-LABEL: s_and_constant_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.X, KC0[2].Z, literal.y, +; EG-NEXT: 2(2.802597e-45), 1234567(1.729997e-39) + %and = and i32 %a, 1234567 + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +; FIXME: We should really duplicate the constant so that the SALU use +; can fold into the s_and_b32 and the VALU one is materialized +; directly without copying from the SGPR. + +; Second use is a VGPR use of the constant. 
+ +define amdgpu_kernel void @s_and_multi_use_constant_i32_0(ptr addrspace(1) %out, i32 %a, i32 %b) { +; EG-LABEL: s_and_multi_use_constant_i32_0: +; EG: ; %bb.0: +; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MOV T0.X, literal.x, +; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.X, PV.W, KC0[2].W, +; EG-NEXT: LSHR * T2.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i32 %a, 1234567 + + ; Just to stop future replacement of copy to vgpr + store with VALU op. + %foo = add i32 %and, %b + store volatile i32 %foo, ptr addrspace(1) %out + store volatile i32 1234567, ptr addrspace(1) %out + ret void +} + +; Second use is another SGPR use of the constant. + +define amdgpu_kernel void @s_and_multi_use_constant_i32_1(ptr addrspace(1) %out, i32 %a, i32 %b) { +; EG-LABEL: s_and_multi_use_constant_i32_1: +; EG: ; %bb.0: +; EG-NEXT: ALU 5, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.W, KC0[2].Z, literal.x, +; EG-NEXT: 1234567(1.729997e-39), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.W, PV.W, KC0[2].W, +; EG-NEXT: ADD_INT T0.X, PV.W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) + %and = and i32 %a, 1234567 + %foo = add i32 %and, 1234567 + %bar = add i32 %foo, %b + store volatile i32 %bar, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_and_i32_vgpr_vgpr(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { +; EG-LABEL: v_and_i32_vgpr_vgpr: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: AND_INT T0.X, T0.X, T1.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid + %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid + %a = load i32, ptr addrspace(1) %gep.a + %b = load i32, ptr addrspace(1) %gep.b + %and = and i32 %a, %b + store i32 %and, ptr addrspace(1) %gep.out + ret void +} + +define amdgpu_kernel void @v_and_i32_sgpr_vgpr(ptr addrspace(1) %out, i32 %a, ptr addrspace(1) %bptr) { +; EG-LABEL: v_and_i32_sgpr_vgpr: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 
2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].W, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, KC0[2].Z, T0.X, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.b = getelementptr i32, ptr addrspace(1) %bptr, i32 %tid + %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid + %b = load i32, ptr addrspace(1) %gep.b + %and = and i32 %a, %b + store i32 %and, ptr addrspace(1) %gep.out + ret void +} + +define amdgpu_kernel void @v_and_i32_vgpr_sgpr(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i32 %b) { +; EG-LABEL: v_and_i32_vgpr_sgpr: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, KC0[2].W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, T0.W, +; EG-NEXT: LSHR * T1.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %gep.out = getelementptr i32, ptr addrspace(1) %out, i32 %tid + %a = load i32, ptr addrspace(1) %gep.a + %and = and i32 %a, %b + store i32 %and, ptr addrspace(1) %gep.out + ret void +} + +define amdgpu_kernel void @v_and_constant_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_constant_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %a = load i32, ptr addrspace(1) %gep, align 4 + %and = and i32 %a, 1234567 + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_and_inline_imm_64_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_inline_imm_64_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) + %tid = call i32 
@llvm.amdgcn.workitem.id.x() #0 + %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %a = load i32, ptr addrspace(1) %gep, align 4 + %and = and i32 %a, 64 + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @v_and_inline_imm_neg_16_i32(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_inline_imm_neg_16_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -16(nan), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep = getelementptr i32, ptr addrspace(1) %aptr, i32 %tid + %a = load i32, ptr addrspace(1) %gep, align 4 + %and = and i32 %a, -16 + store i32 %and, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; EG-LABEL: s_and_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, KC0[3].Z, +; EG-NEXT: AND_INT * T0.X, KC0[2].W, KC0[3].Y, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, %b + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) { +; EG-LABEL: s_and_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 12, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_8 T1.X, T0.X, 40, #3 +; EG-NEXT: VTX_READ_8 T0.X, T0.X, 41, #3 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, 0.0, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.W, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T1.W, T1.X, T0.X, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T1.W, PS, 1, +; EG-NEXT: LSHL * T0.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i1 %a, %b + store i1 %and, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_and_constant_i64(ptr addrspace(1) %out, i64 %a) { +; EG-LABEL: s_and_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 128(1.793662e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 524288(7.346840e-40), 2(2.802597e-45) + %and = and i64 %a, 549756338176 + store i64 %and, ptr addrspace(1) 
%out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_multi_use_constant_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; EG-LABEL: s_and_multi_use_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 10, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T5.X, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: AND_INT T0.X, KC0[3].Y, literal.x, +; EG-NEXT: AND_INT * T1.X, KC0[3].Z, literal.y, +; EG-NEXT: 524288(7.346840e-40), 128(1.793662e-43) +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, PV.W, literal.x, +; EG-NEXT: AND_INT * T3.X, KC0[3].X, literal.y, +; EG-NEXT: 2(2.802597e-45), 128(1.793662e-43) +; EG-NEXT: LSHR T4.X, KC0[2].Y, literal.x, +; EG-NEXT: AND_INT * T5.X, KC0[2].W, literal.y, +; EG-NEXT: 2(2.802597e-45), 524288(7.346840e-40) + %and0 = and i64 %a, 549756338176 + %and1 = and i64 %b, 549756338176 + store volatile i64 %and0, ptr addrspace(1) %out + store volatile i64 %and1, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_and_32_bit_constant_i64(ptr addrspace(1) %out, i32, i64 %a) { +; EG-LABEL: s_and_32_bit_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) + %and = and i64 %a, 1234567 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, i32, i64 %a, i32, i64 %b, i32, i64 %c) { +; EG-LABEL: s_and_multi_use_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 17, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T5.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: LSHL T0.W, KC0[3].W, 1, +; EG-NEXT: LSHL * T1.W, KC0[2].W, 1, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, PV.W, KC0[4].W, +; EG-NEXT: AND_INT T1.W, T1.W, literal.x, +; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W, +; EG-NEXT: 62(8.688050e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.X, KC0[5].X, PS, +; EG-NEXT: ADDC_UINT * T0.W, PV.W, KC0[4].W, +; EG-NEXT: ADD_INT T2.X, KC0[5].X, PV.W, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.x, +; EG-NEXT: 4(5.605194e-45), 0(0.000000e+00) +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: ADD_INT * T4.X, T1.W, KC0[4].W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: LSHR * T5.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %shl.a = shl i64 %a, 1 + %shl.b = shl i64 %b, 1 + %and0 = and i64 %shl.a, 62 + %and1 = and i64 %shl.b, 62 + %add0 = add i64 %and0, %c + %add1 = add i64 %and1, %c + store volatile i64 %add0, ptr addrspace(1) %out + store volatile i64 %add1, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_and_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, ptr addrspace(1) %bptr) { +; EG-LABEL: v_and_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @10, KC0[CB0:0-32], 
KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 3, @14, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ADD_INT * T1.X, KC0[2].W, PV.W, +; EG-NEXT: ALU clause starting at 14: +; EG-NEXT: AND_INT * T0.Y, T0.Y, T1.Y, +; EG-NEXT: AND_INT T0.X, T0.X, T1.X, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %gep.b = getelementptr i64, ptr addrspace(1) %bptr, i32 %tid + %b = load i64, ptr addrspace(1) %gep.b, align 8 + %and = and i64 %a, %b + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @v_and_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT * T0.Y, T0.Y, literal.x, +; EG-NEXT: 286(4.007714e-43), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %and = and i64 %a, 1231231234567 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @v_and_multi_use_constant_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_multi_use_constant_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @12 +; EG-NEXT: ALU 0, @22, KC0[], KC1[] +; EG-NEXT: TEX 0 @14 +; EG-NEXT: ALU 0, @23, KC0[], KC1[] +; EG-NEXT: TEX 1 @16 +; EG-NEXT: ALU 10, @24, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.X, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.X, T5.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T4.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1 +; EG-NEXT: Fetch clause starting at 14: +; EG-NEXT: VTX_READ_32 T2.X, T2.X, 0, #1 +; EG-NEXT: Fetch clause starting at 16: +; EG-NEXT: VTX_READ_32 T3.X, T3.X, 4, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 20: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, PV.X, +; EG-NEXT: ALU clause starting at 22: +; EG-NEXT: MOV * T2.X, T0.X, +; EG-NEXT: ALU clause starting at 23: +; EG-NEXT: MOV * T3.X, T0.X, +; EG-NEXT: ALU clause starting at 24: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: AND_INT * T3.X, T3.X, literal.y, +; EG-NEXT: -1424379385(-5.460358e-13), 
286(4.007714e-43) +; EG-NEXT: AND_INT T2.X, T2.X, literal.x, +; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, +; EG-NEXT: -1424379385(-5.460358e-13), 2(2.802597e-45) +; EG-NEXT: AND_INT T1.X, T1.X, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 286(4.007714e-43), 4(5.605194e-45) +; EG-NEXT: LSHR * T5.X, PV.W, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load volatile i64, ptr addrspace(1) %aptr + %b = load volatile i64, ptr addrspace(1) %aptr + %and0 = and i64 %a, 1231231234567 + %and1 = and i64 %b, 1231231234567 + store volatile i64 %and0, ptr addrspace(1) %out + store volatile i64 %and1, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_and_multi_use_inline_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_multi_use_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 1, @20, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @12 +; EG-NEXT: ALU 0, @22, KC0[], KC1[] +; EG-NEXT: TEX 0 @14 +; EG-NEXT: ALU 0, @23, KC0[], KC1[] +; EG-NEXT: TEX 1 @16 +; EG-NEXT: ALU 8, @24, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T4.X, T3.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: Fetch clause starting at 12: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 4, #1 +; EG-NEXT: Fetch clause starting at 14: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; EG-NEXT: Fetch clause starting at 16: +; EG-NEXT: VTX_READ_32 T2.X, T2.X, 4, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 20: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, PV.X, +; EG-NEXT: ALU clause starting at 22: +; EG-NEXT: MOV * T1.X, T0.X, +; EG-NEXT: ALU clause starting at 23: +; EG-NEXT: MOV * T2.X, T0.X, +; EG-NEXT: ALU clause starting at 24: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: AND_INT * T1.X, T1.X, literal.x, +; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) +; EG-NEXT: LSHR T2.X, KC0[2].Y, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[2].Y, literal.y, +; EG-NEXT: 2(2.802597e-45), 4(5.605194e-45) +; EG-NEXT: LSHR T3.X, PV.W, literal.x, +; EG-NEXT: MOV * T4.X, literal.y, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load volatile i64, ptr addrspace(1) %aptr + %b = load volatile i64, ptr addrspace(1) %aptr + %and0 = and i64 %a, 63 + %and1 = and i64 %b, 63 + store volatile i64 %and0, ptr addrspace(1) %out + store volatile i64 %and1, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @v_and_i64_32_bit_constant(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_i64_32_bit_constant: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1234567(1.729997e-39), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %and = and i64 %a, 1234567 + store 
i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @v_and_inline_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %and = and i64 %a, 64 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +; FIXME: Should be able to reduce load width + +define amdgpu_kernel void @v_and_inline_neg_imm_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr) { +; EG-LABEL: v_and_inline_neg_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: LSHL * T0.W, T0.X, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.X, KC0[2].Z, PV.W, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: AND_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -8(nan), 2(2.802597e-45) + %tid = call i32 @llvm.amdgcn.workitem.id.x() #0 + %gep.a = getelementptr i64, ptr addrspace(1) %aptr, i32 %tid + %a = load i64, ptr addrspace(1) %gep.a, align 8 + %and = and i64 %a, -8 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_64_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_64_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 64(8.968310e-44), 2(2.802597e-45) + %and = and i64 %a, 64 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_64_i64_noshrink(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a, i32, i64 %b) { +; EG-LABEL: s_and_inline_imm_64_i64_noshrink: +; EG: ; %bb.0: +; EG-NEXT: ALU 7, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHL * T0.W, KC0[2].W, 1, +; EG-NEXT: AND_INT * T0.W, PV.W, literal.x, +; EG-NEXT: 64(8.968310e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, PV.W, KC0[3].W, +; EG-NEXT: ADDC_UINT T0.W, PV.W, KC0[3].W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: ADD_INT * T0.Y, KC0[4].X, PV.W, + %shl = shl i64 %a, 1 + %and = and i64 %shl, 64 + %add = add 
i64 %and, %b + store i64 %add, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_1_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_1_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT T0.X, KC0[2].W, 1, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 1 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_1.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 1072693248(1.875000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 4607182418800017408 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_neg_1.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_neg_1.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: -1074790400(-1.875000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 13830554455654793216 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_0.5_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 1071644672(1.750000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 4602678819172646912 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_neg_0.5_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_neg_0.5_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: -1075838976(-1.750000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 13826050856027422720 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_2.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: 
MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: 1073741824(2.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 4611686018427387904
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_neg_2.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_neg_2.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: -1073741824(-2.000000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 13835058055282163712
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_4.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: 1074790400(2.250000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 4616189618054758400
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+define amdgpu_kernel void @s_and_inline_imm_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) {
+; EG-LABEL: s_and_inline_imm_neg_4.0_i64:
+; EG: ; %bb.0:
+; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[]
+; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1
+; EG-NEXT: CF_END
+; EG-NEXT: PAD
+; EG-NEXT: ALU clause starting at 4:
+; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x,
+; EG-NEXT: -1072693248(-2.250000e+00), 0(0.000000e+00)
+; EG-NEXT: MOV T0.X, 0.0,
+; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x,
+; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00)
+ %and = and i64 %a, 13839561654909534208
+ store i64 %and, ptr addrspace(1) %out, align 8
+ ret void
+}
+
+; Test with the 64-bit integer bitpattern for a 32-bit float in the
+; low 32-bits, which is not a valid 64-bit inline immediate.
+ +define amdgpu_kernel void @s_and_inline_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_f32_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 1082130432(4.000000e+00), 2(2.802597e-45) + %and = and i64 %a, 1082130432 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_imm_f32_neg_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MOV * T0.Y, KC0[3].X, +; EG-NEXT: AND_INT T0.X, KC0[2].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -1065353216(-4.000000e+00), 2(2.802597e-45) + %and = and i64 %a, -1065353216 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +; Shift into upper 32-bits + +define amdgpu_kernel void @s_and_inline_high_imm_f32_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_high_imm_f32_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: 1082130432(4.000000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 4647714815446351872 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @s_and_inline_high_imm_f32_neg_4.0_i64(ptr addrspace(1) %out, ptr addrspace(1) %aptr, i64 %a) { +; EG-LABEL: s_and_inline_high_imm_f32_neg_4.0_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: AND_INT * T0.Y, KC0[3].X, literal.x, +; EG-NEXT: -1065353216(-4.000000e+00), 0(0.000000e+00) +; EG-NEXT: MOV T0.X, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %and = and i64 %a, 13871086852301127680 + store i64 %and, ptr addrspace(1) %out, align 8 + ret void +} +attributes #0 = { nounwind readnone } diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll index 752a87ac3cb73..6740e554568d0 100644 --- a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll @@ -181,25 +181,25 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX-950-LABEL: v_test_cvt_v2f64_v2bf16_v: ; GFX-950: ; %bb.0: ; GFX-950-NEXT: v_cvt_f32_f64_e32 v6, v[2:3] +; GFX-950-NEXT: v_and_b32_e32 v4, 1, v6 +; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4 ; GFX-950-NEXT: v_cvt_f64_f32_e32 v[4:5], v6 -; GFX-950-NEXT: v_and_b32_e32 v7, 1, v6 ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, |v[4:5]| -; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[2:3], v[4:5] -; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v7 +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[2:3], v[4:5] +; GFX-950-NEXT: 
v_cvt_f32_f64_e32 v7, v[0:1] ; GFX-950-NEXT: v_cndmask_b32_e64 v2, -1, 1, s[2:3] ; GFX-950-NEXT: v_add_u32_e32 v2, v6, v2 -; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-950-NEXT: v_cvt_f32_f64_e32 v5, v[0:1] +; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] ; GFX-950-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc -; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v5 -; GFX-950-NEXT: v_and_b32_e32 v6, 1, v5 +; GFX-950-NEXT: v_cvt_f64_f32_e32 v[2:3], v7 +; GFX-950-NEXT: v_and_b32_e32 v8, 1, v7 ; GFX-950-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, |v[2:3]| -; GFX-950-NEXT: v_cmp_nlg_f64_e32 vcc, v[0:1], v[2:3] -; GFX-950-NEXT: v_cmp_eq_u32_e64 s[0:1], 1, v6 +; GFX-950-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8 +; GFX-950-NEXT: v_cmp_nlg_f64_e64 s[0:1], v[0:1], v[2:3] ; GFX-950-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3] -; GFX-950-NEXT: v_add_u32_e32 v0, v5, v0 -; GFX-950-NEXT: s_or_b64 vcc, s[0:1], vcc -; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc +; GFX-950-NEXT: v_add_u32_e32 v0, v7, v0 +; GFX-950-NEXT: s_or_b64 vcc, vcc, s[0:1] +; GFX-950-NEXT: v_cndmask_b32_e32 v0, v0, v7, vcc ; GFX-950-NEXT: v_cvt_pk_bf16_f32 v0, v0, v4 ; GFX-950-NEXT: ; return to shader part epilog ; @@ -221,10 +221,10 @@ define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) { ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_cndmask_b32_e64 v0, -1, 1, s1 ; GFX1250-NEXT: v_and_b32_e32 v11, 1, v9 -; GFX1250-NEXT: v_cmp_eq_u32_e64 s1, 1, v10 +; GFX1250-NEXT: v_cmp_ne_u32_e64 s1, 0, v10 ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX1250-NEXT: v_add_nc_u32_e32 v0, v9, v0 -; GFX1250-NEXT: v_cmp_eq_u32_e64 s2, 1, v11 +; GFX1250-NEXT: v_cmp_ne_u32_e64 s2, 0, v11 ; GFX1250-NEXT: s_or_b32 vcc_lo, s1, vcc_lo ; GFX1250-NEXT: v_cndmask_b32_e32 v1, v1, v8, vcc_lo ; GFX1250-NEXT: s_or_b32 vcc_lo, s2, s0 @@ -344,7 +344,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) { ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm entry: %a.cvt = fptrunc float %a to bfloat @@ -380,7 +380,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) { ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, |v0|, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm entry: %a.abs = call float @llvm.fabs.f32(float %a) @@ -417,7 +417,7 @@ define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) { ; GFX1250: ; %bb.0: ; %entry ; GFX1250-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v1 ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, -v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm entry: %a.neg = fneg float %a @@ -480,7 +480,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) { ; GFX1250-NEXT: s_or_b32 vcc_lo, vcc_lo, s0 ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm entry: %a.cvt = fptrunc double %a to bfloat @@ -543,7 +543,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) { ; 
GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm entry: %a.neg = fneg double %a @@ -607,7 +607,7 @@ define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) { ; GFX1250-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX1250-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX1250-NEXT: v_cvt_pk_bf16_f32 v0, v0, s0 -; GFX1250-NEXT: flat_store_b16 v[2:3], v0 +; GFX1250-NEXT: flat_store_b16 v[2:3], v0 scope:SCOPE_SE ; GFX1250-NEXT: s_endpgm entry: %a.abs = call double @llvm.fabs.f64(double %a) diff --git a/llvm/test/CodeGen/AMDGPU/bfi_int.ll b/llvm/test/CodeGen/AMDGPU/bfi_int.ll index 628301b8f8e7e..8f567b6b87322 100644 --- a/llvm/test/CodeGen/AMDGPU/bfi_int.ll +++ b/llvm/test/CodeGen/AMDGPU/bfi_int.ll @@ -582,15 +582,15 @@ define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> % ; GFX7-LABEL: v_bitselect_v2i32_pat1: ; GFX7: ; %bb.0: ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4 ; GFX7-NEXT: v_bfi_b32 v1, v3, v1, v5 +; GFX7-NEXT: v_bfi_b32 v0, v2, v0, v4 ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: v_bitselect_v2i32_pat1: ; GFX8: ; %bb.0: ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4 ; GFX8-NEXT: v_bfi_b32 v1, v3, v1, v5 +; GFX8-NEXT: v_bfi_b32 v0, v2, v0, v4 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: v_bitselect_v2i32_pat1: diff --git a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll index 021104114d796..f5227eed458d6 100644 --- a/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll +++ b/llvm/test/CodeGen/AMDGPU/copysign-simplify-demanded-bits.ll @@ -31,8 +31,8 @@ define <2 x half> @test_pown_reduced_fast_v2f16_known_odd(<2 x half> %x, <2 x i3 ; GFX9-LABEL: test_pown_reduced_fast_v2f16_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_or_b32_e32 v2, 1, v2 +; GFX9-NEXT: v_or_b32_e32 v1, 1, v1 ; GFX9-NEXT: v_cvt_f32_i32_e32 v2, v2 ; GFX9-NEXT: v_cvt_f32_i32_e32 v1, v1 ; GFX9-NEXT: v_and_b32_e32 v3, 0x7fff7fff, v0 @@ -126,8 +126,8 @@ define <2 x double> @test_pown_reduced_fast_v2f64_known_odd(<2 x double> %x, <2 ; GFX9-LABEL: test_pown_reduced_fast_v2f64_known_odd: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_or_b32_e32 v6, 1, v5 +; GFX9-NEXT: v_or_b32_e32 v4, 1, v4 ; GFX9-NEXT: v_cvt_f64_i32_e32 v[4:5], v4 ; GFX9-NEXT: v_cvt_f64_i32_e32 v[6:7], v6 ; GFX9-NEXT: s_brev_b32 s4, -2 diff --git a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll index d63a36c4b2958..7e2e8b577e085 100644 --- a/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-preserve-disjoint-flag.ll @@ -28,12 +28,15 @@ define amdgpu_ps <2 x i32> @s_or_v2i32_disjoint(<2 x i32> inreg %a, <2 x i32> in ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr0 - ; CHECK-NEXT: [[S_OR_B32_:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY2]], [[COPY]], implicit-def dead $scc - ; CHECK-NEXT: [[S_OR_B32_1:%[0-9]+]]:sreg_32 = disjoint S_OR_B32 [[COPY3]], 
[[COPY1]], implicit-def dead $scc - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_1]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY4]], implicit $exec - ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[S_OR_B32_]] - ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[S_OR_B64_:%[0-9]+]]:sreg_64 = disjoint S_OR_B64 killed [[REG_SEQUENCE1]], killed [[REG_SEQUENCE]], implicit-def dead $scc + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub0 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[COPY4]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY5]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:sreg_32 = COPY [[S_OR_B64_]].sub1 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY6]] + ; CHECK-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sreg_32_xm0 = V_READFIRSTLANE_B32 killed [[COPY7]], implicit $exec ; CHECK-NEXT: $sgpr0 = COPY [[V_READFIRSTLANE_B32_]] ; CHECK-NEXT: $sgpr1 = COPY [[V_READFIRSTLANE_B32_1]] ; CHECK-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 @@ -64,10 +67,23 @@ define <2 x i32> @v_or_v2i32_disjoint(<2 x i32> %a, <2 x i32> %b) { ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 ; CHECK-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 ; CHECK-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY3]], [[COPY1]], implicit $exec - ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = disjoint V_OR_B32_e64 [[COPY2]], [[COPY]], implicit $exec - ; CHECK-NEXT: $vgpr0 = COPY [[V_OR_B32_e64_]] - ; CHECK-NEXT: $vgpr1 = COPY [[V_OR_B32_e64_1]] + ; CHECK-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; CHECK-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub1 + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub1 + ; CHECK-NEXT: [[V_OR_B32_e64_:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY5]], killed [[COPY4]], implicit $exec + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE1]].sub0 + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE]].sub0 + ; CHECK-NEXT: [[V_OR_B32_e64_1:%[0-9]+]]:vgpr_32 = V_OR_B32_e64 killed [[COPY7]], killed [[COPY6]], implicit $exec + ; CHECK-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE killed [[V_OR_B32_e64_1]], %subreg.sub0, killed [[V_OR_B32_e64_]], %subreg.sub1 + ; CHECK-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub0 + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY [[REG_SEQUENCE2]].sub1 + ; CHECK-NEXT: $vgpr0 = COPY [[COPY8]] + ; CHECK-NEXT: $vgpr1 = COPY [[COPY9]] ; CHECK-NEXT: SI_RETURN implicit $vgpr0, implicit $vgpr1 %result = or disjoint <2 x i32> %a, %b ret <2 x i32> %result diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll index b35b55338e5fe..ef68f44bac203 100644 --- 
a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -2010,61 +2010,61 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; SI-LABEL: v_fshr_v2i24: ; SI: ; %bb.0: ; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; SI-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; SI-NEXT: s_mov_b32 s4, 0xaaaaaab -; SI-NEXT: v_mul_hi_u32 v6, v6, s4 -; SI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; SI-NEXT: v_mul_hi_u32 v6, v4, s4 +; SI-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; SI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; SI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; SI-NEXT: v_sub_i32_e32 v4, vcc, v4, v6 -; SI-NEXT: v_mul_hi_u32 v6, v7, s4 +; SI-NEXT: v_mul_hi_u32 v6, v5, s4 ; SI-NEXT: v_add_i32_e32 v4, vcc, 8, v4 ; SI-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; SI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; SI-NEXT: v_mul_u32_u24_e32 v3, 24, v6 -; SI-NEXT: v_sub_i32_e32 v3, vcc, v5, v3 -; SI-NEXT: v_add_i32_e32 v3, vcc, 8, v3 -; SI-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; SI-NEXT: v_mul_u32_u24_e32 v2, 24, v6 +; SI-NEXT: v_sub_i32_e32 v2, vcc, v5, v2 +; SI-NEXT: v_add_i32_e32 v2, vcc, 8, v2 +; SI-NEXT: v_alignbit_b32 v1, v1, v3, v2 ; SI-NEXT: s_setpc_b64 s[30:31] ; ; VI-LABEL: v_fshr_v2i24: ; VI: ; %bb.0: ; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; VI-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; VI-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; VI-NEXT: s_mov_b32 s4, 0xaaaaaab -; VI-NEXT: v_mul_hi_u32 v6, v6, s4 -; VI-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; VI-NEXT: v_mul_hi_u32 v6, v4, s4 +; VI-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; VI-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; VI-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; VI-NEXT: v_sub_u32_e32 v4, vcc, v4, v6 -; VI-NEXT: v_mul_hi_u32 v6, v7, s4 +; VI-NEXT: v_mul_hi_u32 v6, v5, s4 ; VI-NEXT: v_add_u32_e32 v4, vcc, 8, v4 ; VI-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; VI-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; VI-NEXT: v_mul_u32_u24_e32 v3, 24, v6 -; VI-NEXT: v_sub_u32_e32 v3, vcc, v5, v3 -; VI-NEXT: v_add_u32_e32 v3, vcc, 8, v3 -; VI-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; VI-NEXT: v_mul_u32_u24_e32 v2, 24, v6 +; VI-NEXT: v_sub_u32_e32 v2, vcc, v5, v2 +; VI-NEXT: v_add_u32_e32 v2, vcc, 8, v2 +; VI-NEXT: v_alignbit_b32 v1, v1, v3, v2 ; VI-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: v_fshr_v2i24: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: v_and_b32_e32 v6, 0xffffff, v4 +; GFX9-NEXT: v_and_b32_e32 v4, 0xffffff, v4 ; GFX9-NEXT: s_mov_b32 s4, 0xaaaaaab -; GFX9-NEXT: v_mul_hi_u32 v6, v6, s4 -; GFX9-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; GFX9-NEXT: v_mul_hi_u32 v6, v4, s4 +; GFX9-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v3, 8, v3 ; GFX9-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; GFX9-NEXT: v_sub_u32_e32 v4, v4, v6 -; GFX9-NEXT: v_mul_hi_u32 v6, v7, s4 +; GFX9-NEXT: v_mul_hi_u32 v6, v5, s4 ; GFX9-NEXT: v_add_u32_e32 v4, 8, v4 ; GFX9-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 8, v3 -; GFX9-NEXT: v_mul_u32_u24_e32 v3, 24, v6 -; GFX9-NEXT: v_sub_u32_e32 v3, v5, v3 -; GFX9-NEXT: v_add_u32_e32 v3, 8, v3 -; GFX9-NEXT: v_alignbit_b32 v1, v1, v2, v3 +; GFX9-NEXT: v_mul_u32_u24_e32 v2, 24, v6 +; GFX9-NEXT: v_sub_u32_e32 v2, v5, v2 +; GFX9-NEXT: v_add_u32_e32 v2, 8, v2 +; GFX9-NEXT: v_alignbit_b32 v1, v1, v3, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; R600-LABEL: v_fshr_v2i24: @@ -2075,12 +2075,12 @@ define <2 x i24> 
@v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10-LABEL: v_fshr_v2i24: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX10-NEXT: v_and_b32_e32 v7, 0xffffff, v5 +; GFX10-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX10-NEXT: v_and_b32_e32 v5, 0xffffff, v5 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 8, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 +; GFX10-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX10-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 ; GFX10-NEXT: v_mul_u32_u24_e32 v6, 24, v6 ; GFX10-NEXT: v_mul_u32_u24_e32 v7, 24, v7 ; GFX10-NEXT: v_sub_nc_u32_e32 v4, v4, v6 @@ -2091,109 +2091,29 @@ define <2 x i24> @v_fshr_v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2 ; GFX10-NEXT: v_alignbit_b32 v1, v1, v3, v5 ; GFX10-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-TRUE16-LABEL: v_fshr_v2i24: -; GFX11-TRUE16: ; %bb.0: -; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX11-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX11-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX11-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX11-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX11-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l -; GFX11-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l -; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX11-FAKE16-LABEL: v_fshr_v2i24: -; GFX11-FAKE16: ; %bb.0: -; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-FAKE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX11-FAKE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX11-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX11-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX11-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX11-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX11-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; 
GFX11-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-TRUE16-LABEL: v_fshr_v2i24: -; GFX12-TRUE16: ; %bb.0: -; GFX12-TRUE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-TRUE16-NEXT: s_wait_expcnt 0x0 -; GFX12-TRUE16-NEXT: s_wait_samplecnt 0x0 -; GFX12-TRUE16-NEXT: s_wait_bvhcnt 0x0 -; GFX12-TRUE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX12-TRUE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX12-TRUE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-TRUE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX12-TRUE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX12-TRUE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX12-TRUE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX12-TRUE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX12-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-TRUE16-NEXT: v_alignbit_b32 v0, v0, v2, v4.l -; GFX12-TRUE16-NEXT: v_alignbit_b32 v1, v1, v3, v5.l -; GFX12-TRUE16-NEXT: s_setpc_b64 s[30:31] -; -; GFX12-FAKE16-LABEL: v_fshr_v2i24: -; GFX12-FAKE16: ; %bb.0: -; GFX12-FAKE16-NEXT: s_wait_loadcnt_dscnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_expcnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_samplecnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_bvhcnt 0x0 -; GFX12-FAKE16-NEXT: s_wait_kmcnt 0x0 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v6, 0xffffff, v4 -; GFX12-FAKE16-NEXT: v_and_b32_e32 v7, 0xffffff, v5 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v2, 8, v2 -; GFX12-FAKE16-NEXT: v_lshlrev_b32_e32 v3, 8, v3 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX12-FAKE16-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v6 -; GFX12-FAKE16-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v7 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v6, 24, v6 -; GFX12-FAKE16-NEXT: v_mul_u32_u24_e32 v7, 24, v7 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v4, v4, v6 -; GFX12-FAKE16-NEXT: v_sub_nc_u32_e32 v5, v5, v7 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v4, 8, v4 -; GFX12-FAKE16-NEXT: v_add_nc_u32_e32 v5, 8, v5 -; GFX12-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX12-FAKE16-NEXT: v_alignbit_b32 v0, v0, v2, v4 -; GFX12-FAKE16-NEXT: v_alignbit_b32 v1, v1, v3, v5 -; GFX12-FAKE16-NEXT: s_setpc_b64 s[30:31] +; GFX11-LABEL: v_fshr_v2i24: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-NEXT: v_and_b32_e32 v4, 0xffffff, v4 +; GFX11-NEXT: v_and_b32_e32 v5, 0xffffff, v5 +; GFX11-NEXT: v_lshlrev_b32_e32 v2, 8, v2 +; GFX11-NEXT: v_lshlrev_b32_e32 v3, 8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_mul_hi_u32 v6, 0xaaaaaab, v4 +; GFX11-NEXT: v_mul_hi_u32 v7, 0xaaaaaab, v5 
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_mul_u32_u24_e32 v6, 24, v6 +; GFX11-NEXT: v_mul_u32_u24_e32 v7, 24, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_sub_nc_u32_e32 v4, v4, v6 +; GFX11-NEXT: v_sub_nc_u32_e32 v5, v5, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_add_nc_u32_e32 v4, 8, v4 +; GFX11-NEXT: v_add_nc_u32_e32 v5, 8, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-NEXT: v_alignbit_b32 v0, v0, v2, v4 +; GFX11-NEXT: v_alignbit_b32 v1, v1, v3, v5 +; GFX11-NEXT: s_setpc_b64 s[30:31] %ret = call <2 x i24> @llvm.fshr.v2i24(<2 x i24> %src0, <2 x i24> %src1, <2 x i24> %src2) ret <2 x i24> %ret } diff --git a/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll index b3c7ac80dd014..23ebfb817096f 100644 --- a/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll +++ b/llvm/test/CodeGen/AMDGPU/integer-select-src-modifiers.ll @@ -341,21 +341,24 @@ define <2 x i32> @s_fneg_select_v2i32_1(<2 x i32> inreg %cond, <2 x i32> inreg % ; GCN-LABEL: s_fneg_select_v2i32_1: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_b32 s4, s19, 0x80000000 -; GCN-NEXT: s_xor_b32 s5, s18, 0x80000000 +; GCN-NEXT: s_brev_b32 s4, 1 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_xor_b64 s[4:5], s[18:19], s[4:5] ; GCN-NEXT: s_cmp_eq_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s5, s5, s20 +; GCN-NEXT: s_cselect_b32 s4, s4, s20 ; GCN-NEXT: s_cmp_eq_u32 s17, 0 -; GCN-NEXT: s_cselect_b32 s4, s4, s21 -; GCN-NEXT: v_mov_b32_e32 v0, s5 -; GCN-NEXT: v_mov_b32_e32 v1, s4 +; GCN-NEXT: s_cselect_b32 s5, s5, s21 +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: s_fneg_select_v2i32_1: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_b32 s3, s3, 0x80000000 -; GFX11-NEXT: s_xor_b32 s2, s2, 0x80000000 +; GFX11-NEXT: s_brev_b32 s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: s_xor_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, s2, s16 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 @@ -373,12 +376,13 @@ define <2 x i32> @s_fneg_fabs_select_v2i32_2(<2 x i32> inreg %cond, <2 x i32> in ; GCN-LABEL: s_fneg_fabs_select_v2i32_2: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_bitset1_b32 s19, 31 -; GCN-NEXT: s_bitset1_b32 s18, 31 +; GCN-NEXT: s_brev_b32 s4, 1 +; GCN-NEXT: s_mov_b32 s5, s4 +; GCN-NEXT: s_or_b64 s[4:5], s[18:19], s[4:5] ; GCN-NEXT: s_cmp_eq_u32 s16, 0 -; GCN-NEXT: s_cselect_b32 s4, s20, s18 +; GCN-NEXT: s_cselect_b32 s4, s20, s4 ; GCN-NEXT: s_cmp_eq_u32 s17, 0 -; GCN-NEXT: s_cselect_b32 s5, s21, s19 +; GCN-NEXT: s_cselect_b32 s5, s21, s5 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -386,8 +390,10 @@ define <2 x i32> @s_fneg_fabs_select_v2i32_2(<2 x i32> inreg %cond, <2 x i32> in ; GFX11-LABEL: s_fneg_fabs_select_v2i32_2: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_bitset1_b32 s3, 31 -; GFX11-NEXT: s_bitset1_b32 s2, 31 +; GFX11-NEXT: s_brev_b32 s4, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) 
+; GFX11-NEXT: s_mov_b32 s5, s4 +; GFX11-NEXT: s_or_b64 s[2:3], s[2:3], s[4:5] ; GFX11-NEXT: s_cmp_eq_u32 s0, 0 ; GFX11-NEXT: s_cselect_b32 s0, s16, s2 ; GFX11-NEXT: s_cmp_eq_u32 s1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/or.ll b/llvm/test/CodeGen/AMDGPU/or.ll index 7ef87a4e4d3c2..728067edcf399 100644 --- a/llvm/test/CodeGen/AMDGPU/or.ll +++ b/llvm/test/CodeGen/AMDGPU/or.ll @@ -1,7 +1,38 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -check-prefix=GFX6 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global < %s | FileCheck -check-prefix=GFX8 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s + +define amdgpu_ps <2 x i32> @s_or_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { +; GFX6-LABEL: s_or_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX6-NEXT: ; return to shader part epilog +; +; GFX8-LABEL: s_or_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_or_b64 s[0:1], s[0:1], s[2:3] +; GFX8-NEXT: ; return to shader part epilog + %result = or <2 x i32> %num, %den + ret <2 x i32> %result +} + +define <2 x i32> @v_or_v2i32(<2 x i32> %num, <2 x i32> %den) { +; GFX6-LABEL: v_or_v2i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: v_or_v2i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 +; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: s_setpc_b64 s[30:31] + %result = or <2 x i32> %num, %den + ret <2 x i32> %result +} define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { ; GFX6-LABEL: or_v2i32: @@ -18,8 +49,8 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX6-NEXT: s_mov_b32 s4, s0 ; GFX6-NEXT: s_mov_b32 s5, s1 ; GFX6-NEXT: s_waitcnt vmcnt(0) -; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX6-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX6-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX6-NEXT: s_endpgm ; @@ -37,28 +68,10 @@ define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: s_mov_b32 s4, s0 ; GFX8-NEXT: s_mov_b32 s5, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v2 +; GFX8-NEXT: v_or_b32_e32 v1, v1, v3 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: or_v2i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT * T0.Y, T0.Y, T0.W, -; EG-NEXT: OR_INT T0.X, T0.X, T0.Z, -; EG-NEXT: LSHR * 
T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 %a = load <2 x i32>, ptr addrspace(1) %in %b = load <2 x i32>, ptr addrspace(1) %b_ptr @@ -111,27 +124,6 @@ define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: or_v4i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 -; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 11: -; EG-NEXT: OR_INT * T0.W, T0.W, T1.W, -; EG-NEXT: OR_INT * T0.Z, T0.Z, T1.Z, -; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, -; EG-NEXT: OR_INT T0.X, T0.X, T1.X, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 %a = load <4 x i32>, ptr addrspace(1) %in %b = load <4 x i32>, ptr addrspace(1) %b_ptr @@ -166,17 +158,6 @@ define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_or_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T1.X, KC0[2].Z, KC0[2].W, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %or = or i32 %a, %b store i32 %or, ptr addrspace(1) %out ret void @@ -220,23 +201,6 @@ define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: v_or_b32_e32 v0, s12, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: vector_or_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %loada = load i32, ptr addrspace(1) %a %or = or i32 %loada, %b store i32 %or, ptr addrspace(1) %out @@ -267,17 +231,6 @@ define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_or_literal_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T1.X, KC0[2].Z, literal.y, -; EG-NEXT: 2(2.802597e-45), 99999(1.401284e-40) %or = or i32 %a, 99999 store i32 %or, ptr addrspace(1) %out, align 4 ret void @@ -311,19 +264,6 @@ define amdgpu_kernel void 
@scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32 ; GFX8-NEXT: v_mov_b32_e32 v1, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_or_literal_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: OR_INT * T0.Y, KC0[5].X, literal.x, -; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) -; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) %or = or i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out ret void @@ -374,29 +314,6 @@ define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_or_literal_multi_use_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 6: -; EG-NEXT: ADDC_UINT * T0.W, KC0[7].Y, literal.x, -; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) -; EG-NEXT: ADD_INT T0.X, KC0[7].Y, literal.x, -; EG-NEXT: ADD_INT * T0.W, KC0[7].Z, PV.W, -; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) -; EG-NEXT: ADD_INT T1.X, PV.W, literal.x, -; EG-NEXT: MOV * T2.X, literal.y, -; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) -; EG-NEXT: OR_INT * T3.Y, KC0[5].X, literal.x, -; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) -; EG-NEXT: OR_INT T3.X, KC0[4].W, literal.x, -; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) %or = or i64 %a, 4261135838621753 store i64 %or, ptr addrspace(1) %out @@ -431,18 +348,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_or_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: MOV * T0.Y, KC0[5].X, -; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) %or = or i64 %a, 63 store i64 %or, ptr addrspace(1) %out ret void @@ -491,26 +396,6 @@ define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) % ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_or_inline_imm_multi_use_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 6: -; EG-NEXT: ADD_INT T0.X, KC0[3].Y, literal.x, -; EG-NEXT: ADDC_UINT * T0.W, KC0[3].Y, literal.x, -; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) -; EG-NEXT: ADD_INT T1.X, KC0[3].Z, PV.W, -; EG-NEXT: MOV * T2.X, literal.x, -; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) -; EG-NEXT: MOV * T3.Y, KC0[3].X, -; EG-NEXT: OR_INT T3.X, KC0[2].W, literal.x, -; EG-NEXT: 
LSHR * T4.X, KC0[2].Y, literal.y, -; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) %or = or i64 %a, 63 store i64 %or, ptr addrspace(1) %out %foo = add i64 %b, 63 @@ -544,19 +429,6 @@ define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [ ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_or_neg_inline_imm_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -8(nan), 2(2.802597e-45) -; EG-NEXT: MOV * T0.Y, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) %or = or i64 %a, -8 store i64 %or, ptr addrspace(1) %out ret void @@ -598,23 +470,6 @@ define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_or_b32_e32 v0, 0xffff, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: vector_or_literal_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) %loada = load i32, ptr addrspace(1) %a, align 4 %or = or i32 %loada, 65535 store i32 %or, ptr addrspace(1) %out, align 4 @@ -657,23 +512,6 @@ define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ; GFX8-NEXT: v_or_b32_e32 v0, 4, v0 ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: vector_or_inline_immediate_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45) %loada = load i32, ptr addrspace(1) %a, align 4 %or = or i32 %loada, 4 store i32 %or, ptr addrspace(1) %out, align 4 @@ -710,18 +548,6 @@ define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_or_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: OR_INT * T0.Y, KC0[3].X, KC0[3].Z, -; EG-NEXT: OR_INT * T0.X, KC0[2].W, KC0[3].Y, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %or = or i64 %a, %b store i64 %or, ptr addrspace(1) %out ret void @@ -773,26 +599,6 @@ define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) ; GFX8-NEXT: v_or_b32_e32 v1, v3, v1 ; GFX8-NEXT: 
buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: vector_or_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV T0.X, KC0[2].Z, -; EG-NEXT: MOV * T1.X, KC0[2].W, -; EG-NEXT: ALU clause starting at 12: -; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, -; EG-NEXT: OR_INT T0.X, T0.X, T1.X, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %loada = load i64, ptr addrspace(1) %a, align 8 %loadb = load i64, ptr addrspace(1) %b, align 8 %or = or i64 %loada, %loadb @@ -840,24 +646,6 @@ define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrs ; GFX8-NEXT: v_or_b32_e32 v1, s13, v1 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: scalar_vector_or_i64: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT * T0.Y, T0.Y, KC0[3].X, -; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %loada = load i64, ptr addrspace(1) %a %or = or i64 %loada, %b store i64 %or, ptr addrspace(1) %out @@ -902,25 +690,6 @@ define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addr ; GFX8-NEXT: v_or_b32_e32 v0, 0xdf77987f, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: vector_or_i64_loadimm: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT * T0.Y, T0.Y, literal.x, -; EG-NEXT: 5231(7.330192e-42), 0(0.000000e+00) -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45) %loada = load i64, ptr addrspace(1) %a, align 8 %or = or i64 %loada, 22470723082367 store i64 %or, ptr addrspace(1) %out @@ -964,23 +733,6 @@ define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspac ; GFX8-NEXT: v_or_b32_e32 v0, 8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: vector_or_i64_imm: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; 
EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) %loada = load i64, ptr addrspace(1) %a, align 8 %or = or i64 %loada, 8 store i64 %or, ptr addrspace(1) %out @@ -1025,25 +777,6 @@ define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, p ; GFX8-NEXT: v_or_b32_e32 v0, -8, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: vector_or_i64_neg_inline_imm: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -8(nan), 2(2.802597e-45) -; EG-NEXT: MOV * T0.Y, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) %loada = load i64, ptr addrspace(1) %a, align 8 %or = or i64 %loada, -8 store i64 %or, ptr addrspace(1) %out @@ -1088,25 +821,6 @@ define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr ; GFX8-NEXT: v_or_b32_e32 v0, 0xffffff38, v0 ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: vector_or_i64_neg_literal: -; EG: ; %bb.0: -; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 0 @6 -; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 8: -; EG-NEXT: MOV * T0.X, KC0[2].Z, -; EG-NEXT: ALU clause starting at 9: -; EG-NEXT: OR_INT T0.X, T0.X, literal.x, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, -; EG-NEXT: -200(nan), 2(2.802597e-45) -; EG-NEXT: MOV * T0.Y, literal.x, -; EG-NEXT: -1(nan), 0(0.000000e+00) %loada = load i64, ptr addrspace(1) %a, align 8 %or = or i64 %loada, -200 store i64 %or, ptr addrspace(1) %out @@ -1139,17 +853,6 @@ define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: trunc_i64_or_to_i32: -; EG: ; %bb.0: -; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) -; EG-NEXT: OR_INT * T1.X, KC0[7].Y, KC0[4].W, %add = or i64 %b, %a %trunc = trunc i64 %add to i32 store i32 %trunc, ptr addrspace(1) %out, align 8 @@ -1210,27 +913,6 @@ define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, p ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: or_i1: -; EG: ; %bb.0: -; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] -; EG-NEXT: TEX 1 @6 -; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: Fetch clause starting at 6: -; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 -; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 -; EG-NEXT: ALU clause starting at 10: -; EG-NEXT: MOV T0.X, KC0[2].Z, -; EG-NEXT: MOV * T1.X, KC0[2].W, -; EG-NEXT: ALU clause starting 
at 12: -; EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X, -; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0, -; EG-NEXT: AND_INT T0.X, PV.W, 1, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -1273,33 +955,9 @@ define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c ; GFX8-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1] ; GFX8-NEXT: buffer_store_byte v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm -; -; EG-LABEL: s_or_i1: -; EG: ; %bb.0: -; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] -; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X -; EG-NEXT: CF_END -; EG-NEXT: PAD -; EG-NEXT: ALU clause starting at 4: -; EG-NEXT: SETE_INT T0.W, KC0[3].X, KC0[3].Y, -; EG-NEXT: SETE_INT * T1.W, KC0[2].Z, KC0[2].W, -; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, -; EG-NEXT: OR_INT * T0.W, PS, PV.W, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: AND_INT T0.W, PS, 1, -; EG-NEXT: LSHL * T1.W, PV.W, literal.x, -; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) -; EG-NEXT: LSHL T0.X, PV.W, PS, -; EG-NEXT: LSHL * T0.W, literal.x, PS, -; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) -; EG-NEXT: MOV T0.Y, 0.0, -; EG-NEXT: MOV * T0.Z, 0.0, -; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, -; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) %cmp0 = icmp eq i32 %a, %b %cmp1 = icmp eq i32 %c, %d %or = or i1 %cmp0, %cmp1 store i1 %or, ptr addrspace(1) %out ret void } - diff --git a/llvm/test/CodeGen/AMDGPU/or.r600.ll b/llvm/test/CodeGen/AMDGPU/or.r600.ll new file mode 100644 index 0000000000000..ed9d0085fd82a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/or.r600.ll @@ -0,0 +1,515 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 3 +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=redwood < %s | FileCheck -check-prefix=EG %s + + +define amdgpu_kernel void @or_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; EG-LABEL: or_v2i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT * T0.Y, T0.Y, T0.W, +; EG-NEXT: OR_INT T0.X, T0.X, T0.Z, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr <2 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <2 x i32>, ptr addrspace(1) %in + %b = load <2 x i32>, ptr addrspace(1) %b_ptr + %result = or <2 x i32> %a, %b + store <2 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @or_v4i32(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; EG-LABEL: or_v4i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 5, @11, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_128 T1.XYZW, T0.X, 16, #1 +; EG-NEXT: VTX_READ_128 T0.XYZW, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 11: +; EG-NEXT: OR_INT * T0.W, T0.W, T1.W, +; EG-NEXT: OR_INT * T0.Z, T0.Z, T1.Z, +; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, 
+; EG-NEXT: OR_INT T0.X, T0.X, T1.X, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %b_ptr = getelementptr <4 x i32>, ptr addrspace(1) %in, i32 1 + %a = load <4 x i32>, ptr addrspace(1) %in + %b = load <4 x i32>, ptr addrspace(1) %b_ptr + %result = or <4 x i32> %a, %b + store <4 x i32> %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_or_i32(ptr addrspace(1) %out, i32 %a, i32 %b) { +; EG-LABEL: scalar_or_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T1.X, KC0[2].Z, KC0[2].W, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %or = or i32 %a, %b + store i32 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, i32 %b) { +; EG-LABEL: vector_or_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %loada = load i32, ptr addrspace(1) %a + %or = or i32 %loada, %b + store i32 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_or_literal_i32(ptr addrspace(1) %out, i32 %a) { +; EG-LABEL: scalar_or_literal_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR T0.X, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T1.X, KC0[2].Z, literal.y, +; EG-NEXT: 2(2.802597e-45), 99999(1.401284e-40) + %or = or i32 %a, 99999 + store i32 %or, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @scalar_or_literal_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; EG-LABEL: scalar_or_literal_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: OR_INT * T0.Y, KC0[5].X, literal.x, +; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) + %or = or i64 %a, 4261135838621753 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_or_literal_multi_use_i64(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { +; EG-LABEL: scalar_or_literal_multi_use_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 12, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: ADDC_UINT * T0.W, KC0[7].Y, literal.x, +; EG-NEXT: 12345(1.729903e-41), 0(0.000000e+00) +; EG-NEXT: ADD_INT T0.X, KC0[7].Y, literal.x, +; EG-NEXT: ADD_INT * T0.W, KC0[7].Z, PV.W, +; EG-NEXT: 12345(1.729903e-41), 
0(0.000000e+00) +; EG-NEXT: ADD_INT T1.X, PV.W, literal.x, +; EG-NEXT: MOV * T2.X, literal.y, +; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; EG-NEXT: OR_INT * T3.Y, KC0[5].X, literal.x, +; EG-NEXT: 992123(1.390260e-39), 0(0.000000e+00) +; EG-NEXT: OR_INT T3.X, KC0[4].W, literal.x, +; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, +; EG-NEXT: 12345(1.729903e-41), 2(2.802597e-45) + %or = or i64 %a, 4261135838621753 + store i64 %or, ptr addrspace(1) %out + + %foo = add i64 %b, 4261135838621753 + store volatile i64 %foo, ptr addrspace(1) poison + ret void +} + +define amdgpu_kernel void @scalar_or_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; EG-LABEL: scalar_or_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: MOV * T0.Y, KC0[5].X, +; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) + %or = or i64 %a, 63 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_or_inline_imm_multi_use_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; EG-LABEL: scalar_or_inline_imm_multi_use_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 9, @6, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T3.XY, T4.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T2.X, 0 +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T2.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 6: +; EG-NEXT: ADD_INT T0.X, KC0[3].Y, literal.x, +; EG-NEXT: ADDC_UINT * T0.W, KC0[3].Y, literal.x, +; EG-NEXT: 63(8.828180e-44), 0(0.000000e+00) +; EG-NEXT: ADD_INT T1.X, KC0[3].Z, PV.W, +; EG-NEXT: MOV * T2.X, literal.x, +; EG-NEXT: 0(0.000000e+00), 0(0.000000e+00) +; EG-NEXT: MOV * T3.Y, KC0[3].X, +; EG-NEXT: OR_INT T3.X, KC0[2].W, literal.x, +; EG-NEXT: LSHR * T4.X, KC0[2].Y, literal.y, +; EG-NEXT: 63(8.828180e-44), 2(2.802597e-45) + %or = or i64 %a, 63 + store i64 %or, ptr addrspace(1) %out + %foo = add i64 %b, 63 + store volatile i64 %foo, ptr addrspace(1) poison + ret void +} + +define amdgpu_kernel void @scalar_or_neg_inline_imm_i64(ptr addrspace(1) %out, [8 x i32], i64 %a) { +; EG-LABEL: scalar_or_neg_inline_imm_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 4, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: OR_INT T0.X, KC0[4].W, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -8(nan), 2(2.802597e-45) +; EG-NEXT: MOV * T0.Y, literal.x, +; EG-NEXT: -1(nan), 0(0.000000e+00) + %or = or i64 %a, -8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_literal_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_literal_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 65535(9.183409e-41), 2(2.802597e-45) + %loada = load i32, ptr addrspace(1) %a, align 4 + %or = or i32 
%loada, 65535 + store i32 %or, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @vector_or_inline_immediate_i32(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_inline_immediate_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 4(5.605194e-45), 2(2.802597e-45) + %loada = load i32, ptr addrspace(1) %a, align 4 + %or = or i32 %loada, 4 + store i32 %or, ptr addrspace(1) %out, align 4 + ret void +} + +define amdgpu_kernel void @scalar_or_i64(ptr addrspace(1) %out, i64 %a, i64 %b) { +; EG-LABEL: scalar_or_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 3, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: OR_INT * T0.Y, KC0[3].X, KC0[3].Z, +; EG-NEXT: OR_INT * T0.X, KC0[2].W, KC0[3].Y, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %or = or i64 %a, %b + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 3, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T1.XY, T1.X, 0, #1 +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, KC0[2].W, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: OR_INT * T0.Y, T0.Y, T1.Y, +; EG-NEXT: OR_INT T0.X, T0.X, T1.X, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %loada = load i64, ptr addrspace(1) %a, align 8 + %loadb = load i64, ptr addrspace(1) %b, align 8 + %or = or i64 %loada, %loadb + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @scalar_vector_or_i64(ptr addrspace(1) %out, ptr addrspace(1) %a, i64 %b) { +; EG-LABEL: scalar_vector_or_i64: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 3, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT * T0.Y, T0.Y, KC0[3].X, +; EG-NEXT: OR_INT T0.X, T0.X, KC0[2].W, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %loada = load i64, ptr addrspace(1) %a + %or = or i64 %loada, %b + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i64_loadimm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64_loadimm: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: 
ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT * T0.Y, T0.Y, literal.x, +; EG-NEXT: 5231(7.330192e-42), 0(0.000000e+00) +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -545810305(-1.784115e+19), 2(2.802597e-45) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = or i64 %loada, 22470723082367 + store i64 %or, ptr addrspace(1) %out + ret void +} + +; FIXME: The or 0 should really be removed. +define amdgpu_kernel void @vector_or_i64_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64_imm: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 2, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_64 T0.XY, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: 8(1.121039e-44), 2(2.802597e-45) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = or i64 %loada, 8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i64_neg_inline_imm(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64_neg_inline_imm: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -8(nan), 2(2.802597e-45) +; EG-NEXT: MOV * T0.Y, literal.x, +; EG-NEXT: -1(nan), 0(0.000000e+00) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = or i64 %loada, -8 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @vector_or_i64_neg_literal(ptr addrspace(1) %out, ptr addrspace(1) %a, ptr addrspace(1) %b) { +; EG-LABEL: vector_or_i64_neg_literal: +; EG: ; %bb.0: +; EG-NEXT: ALU 0, @8, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 0 @6 +; EG-NEXT: ALU 4, @9, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XY, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 8: +; EG-NEXT: MOV * T0.X, KC0[2].Z, +; EG-NEXT: ALU clause starting at 9: +; EG-NEXT: OR_INT T0.X, T0.X, literal.x, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.y, +; EG-NEXT: -200(nan), 2(2.802597e-45) +; EG-NEXT: MOV * T0.Y, literal.x, +; EG-NEXT: -1(nan), 0(0.000000e+00) + %loada = load i64, ptr addrspace(1) %a, align 8 + %or = or i64 %loada, -200 + store i64 %or, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @trunc_i64_or_to_i32(ptr addrspace(1) %out, [8 x i32], i64 %a, [8 x i32], i64 %b) { +; EG-LABEL: trunc_i64_or_to_i32: +; EG: ; %bb.0: +; EG-NEXT: ALU 2, @4, KC0[CB0:0-32], 
KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T1.X, T0.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: LSHR * T0.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; EG-NEXT: OR_INT * T1.X, KC0[7].Y, KC0[4].W, + %add = or i64 %b, %a + %trunc = trunc i64 %add to i32 + store i32 %trunc, ptr addrspace(1) %out, align 8 + ret void +} + +define amdgpu_kernel void @or_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { +; EG-LABEL: or_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 1, @10, KC0[CB0:0-32], KC1[] +; EG-NEXT: TEX 1 @6 +; EG-NEXT: ALU 4, @12, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.X, T1.X, 1 +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: Fetch clause starting at 6: +; EG-NEXT: VTX_READ_32 T1.X, T1.X, 0, #1 +; EG-NEXT: VTX_READ_32 T0.X, T0.X, 0, #1 +; EG-NEXT: ALU clause starting at 10: +; EG-NEXT: MOV T0.X, KC0[2].Z, +; EG-NEXT: MOV * T1.X, KC0[2].W, +; EG-NEXT: ALU clause starting at 12: +; EG-NEXT: MAX_DX10 * T0.W, T0.X, T1.X, +; EG-NEXT: SETGE_DX10 * T0.W, PV.W, 0.0, +; EG-NEXT: AND_INT T0.X, PV.W, 1, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %a = load float, ptr addrspace(1) %in0 + %b = load float, ptr addrspace(1) %in1 + %acmp = fcmp oge float %a, 0.000000e+00 + %bcmp = fcmp oge float %b, 0.000000e+00 + %or = or i1 %acmp, %bcmp + %result = zext i1 %or to i32 + store i32 %result, ptr addrspace(1) %out + ret void +} + +define amdgpu_kernel void @s_or_i1(ptr addrspace(1) %out, i32 %a, i32 %b, i32 %c, i32 %d) { +; EG-LABEL: s_or_i1: +; EG: ; %bb.0: +; EG-NEXT: ALU 14, @4, KC0[CB0:0-32], KC1[] +; EG-NEXT: MEM_RAT MSKOR T0.XW, T1.X +; EG-NEXT: CF_END +; EG-NEXT: PAD +; EG-NEXT: ALU clause starting at 4: +; EG-NEXT: SETE_INT T0.W, KC0[3].X, KC0[3].Y, +; EG-NEXT: SETE_INT * T1.W, KC0[2].Z, KC0[2].W, +; EG-NEXT: AND_INT T2.W, KC0[2].Y, literal.x, +; EG-NEXT: OR_INT * T0.W, PS, PV.W, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: AND_INT T0.W, PS, 1, +; EG-NEXT: LSHL * T1.W, PV.W, literal.x, +; EG-NEXT: 3(4.203895e-45), 0(0.000000e+00) +; EG-NEXT: LSHL T0.X, PV.W, PS, +; EG-NEXT: LSHL * T0.W, literal.x, PS, +; EG-NEXT: 255(3.573311e-43), 0(0.000000e+00) +; EG-NEXT: MOV T0.Y, 0.0, +; EG-NEXT: MOV * T0.Z, 0.0, +; EG-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; EG-NEXT: 2(2.802597e-45), 0(0.000000e+00) + %cmp0 = icmp eq i32 %a, %b + %cmp1 = icmp eq i32 %c, %d + %or = or i1 %cmp0, %cmp1 + store i1 %or, ptr addrspace(1) %out + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/rotr.ll b/llvm/test/CodeGen/AMDGPU/rotr.ll index 74ac181c120b5..3e7b8f438efdb 100644 --- a/llvm/test/CodeGen/AMDGPU/rotr.ll +++ b/llvm/test/CodeGen/AMDGPU/rotr.ll @@ -228,6 +228,134 @@ entry: ret void } +define amdgpu_kernel void @rotr_v8i32(ptr addrspace(1) %in, <8 x i32> %x, <8 x i32> %y) { +; R600-LABEL: rotr_v8i32: +; R600: ; %bb.0: ; %entry +; R600-NEXT: ALU 13, @4, KC0[CB0:0-32], KC1[] +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T2.XYZW, T3.X, 0 +; R600-NEXT: MEM_RAT_CACHELESS STORE_RAW T0.XYZW, T1.X, 1 +; R600-NEXT: CF_END +; R600-NEXT: ALU clause starting at 4: +; R600-NEXT: BIT_ALIGN_INT * T0.W, KC0[5].X, KC0[5].X, KC0[7].X, +; R600-NEXT: BIT_ALIGN_INT * T0.Z, KC0[4].W, KC0[4].W, KC0[6].W, +; R600-NEXT: BIT_ALIGN_INT * T0.Y, KC0[4].Z, KC0[4].Z, KC0[6].Z, +; R600-NEXT: BIT_ALIGN_INT * T0.X, KC0[4].Y, KC0[4].Y, KC0[6].Y, +; R600-NEXT: LSHR * T1.X, KC0[2].Y, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; R600-NEXT: BIT_ALIGN_INT * T2.W, KC0[6].X, 
KC0[6].X, KC0[8].X, +; R600-NEXT: BIT_ALIGN_INT * T2.Z, KC0[5].W, KC0[5].W, KC0[7].W, +; R600-NEXT: BIT_ALIGN_INT * T2.Y, KC0[5].Z, KC0[5].Z, KC0[7].Z, +; R600-NEXT: BIT_ALIGN_INT * T2.X, KC0[5].Y, KC0[5].Y, KC0[7].Y, +; R600-NEXT: ADD_INT * T1.W, KC0[2].Y, literal.x, +; R600-NEXT: 16(2.242078e-44), 0(0.000000e+00) +; R600-NEXT: LSHR * T3.X, PV.W, literal.x, +; R600-NEXT: 2(2.802597e-45), 0(0.000000e+00) +; +; SI-LABEL: rotr_v8i32: +; SI: ; %bb.0: ; %entry +; SI-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s19 +; SI-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; SI-NEXT: v_mov_b32_e32 v0, s18 +; SI-NEXT: v_alignbit_b32 v2, s10, s10, v0 +; SI-NEXT: v_mov_b32_e32 v0, s17 +; SI-NEXT: v_alignbit_b32 v1, s9, s9, v0 +; SI-NEXT: v_mov_b32_e32 v0, s16 +; SI-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; SI-NEXT: v_mov_b32_e32 v4, s23 +; SI-NEXT: v_alignbit_b32 v7, s15, s15, v4 +; SI-NEXT: v_mov_b32_e32 v4, s22 +; SI-NEXT: v_alignbit_b32 v6, s14, s14, v4 +; SI-NEXT: v_mov_b32_e32 v4, s21 +; SI-NEXT: v_alignbit_b32 v5, s13, s13, v4 +; SI-NEXT: v_mov_b32_e32 v4, s20 +; SI-NEXT: v_alignbit_b32 v4, s12, s12, v4 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 offset:16 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm +; +; GFX8-LABEL: rotr_v8i32: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s18 +; GFX8-NEXT: v_mov_b32_e32 v4, s17 +; GFX8-NEXT: v_alignbit_b32 v2, s10, s10, v1 +; GFX8-NEXT: v_alignbit_b32 v1, s9, s9, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s23 +; GFX8-NEXT: v_alignbit_b32 v7, s15, s15, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s22 +; GFX8-NEXT: s_add_u32 s2, s0, 16 +; GFX8-NEXT: v_alignbit_b32 v6, s14, s14, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s21 +; GFX8-NEXT: s_addc_u32 s3, s1, 0 +; GFX8-NEXT: v_alignbit_b32 v5, s13, s13, v4 +; GFX8-NEXT: v_mov_b32_e32 v4, s20 +; GFX8-NEXT: v_mov_b32_e32 v9, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s19 +; GFX8-NEXT: v_alignbit_b32 v4, s12, s12, v4 +; GFX8-NEXT: v_mov_b32_e32 v8, s2 +; GFX8-NEXT: v_alignbit_b32 v3, s11, s11, v0 +; GFX8-NEXT: v_mov_b32_e32 v0, s16 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GFX8-NEXT: v_alignbit_b32 v0, s8, s8, v0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; GFX8-NEXT: s_endpgm +; +; GFX10-LABEL: rotr_v8i32: +; GFX10: ; %bb.0: ; %entry +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x44 +; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x24 +; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: s_waitcnt lgkmcnt(0) +; GFX10-NEXT: v_alignbit_b32 v7, s15, s15, s23 +; GFX10-NEXT: v_alignbit_b32 v6, s14, s14, s22 +; GFX10-NEXT: v_alignbit_b32 v5, s13, s13, s21 +; GFX10-NEXT: v_alignbit_b32 v4, s12, s12, s20 +; GFX10-NEXT: v_alignbit_b32 v3, s11, s11, s19 +; GFX10-NEXT: v_alignbit_b32 v2, s10, s10, s18 +; GFX10-NEXT: v_alignbit_b32 v1, s9, s9, s17 +; GFX10-NEXT: v_alignbit_b32 v0, s8, s8, s16 +; GFX10-NEXT: global_store_dwordx4 v8, v[4:7], s[0:1] offset:16 +; GFX10-NEXT: global_store_dwordx4 v8, v[0:3], s[0:1] +; GFX10-NEXT: s_endpgm +; +; GFX11-LABEL: rotr_v8i32: +; GFX11: ; %bb.0: ; %entry +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: s_load_b512 s[8:23], s[4:5], 0x44 +; GFX11-NEXT: s_load_b64 s[0:1], s[4:5], 
0x24 +; GFX11-NEXT: v_mov_b32_e32 v8, 0 +; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: v_alignbit_b32 v7, s15, s15, s23 +; GFX11-NEXT: v_alignbit_b32 v6, s14, s14, s22 +; GFX11-NEXT: v_alignbit_b32 v5, s13, s13, s21 +; GFX11-NEXT: v_alignbit_b32 v4, s12, s12, s20 +; GFX11-NEXT: v_alignbit_b32 v3, s11, s11, s19 +; GFX11-NEXT: v_alignbit_b32 v2, s10, s10, s18 +; GFX11-NEXT: v_alignbit_b32 v1, s9, s9, s17 +; GFX11-NEXT: v_alignbit_b32 v0, s8, s8, s16 +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: global_store_b128 v8, v[4:7], s[0:1] offset:16 +; GFX11-NEXT: global_store_b128 v8, v[0:3], s[0:1] +; GFX11-NEXT: s_endpgm +entry: + %tmp0 = sub <8 x i32> <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>, %y + %tmp1 = shl <8 x i32> %x, %tmp0 + %tmp2 = lshr <8 x i32> %x, %y + %tmp3 = or <8 x i32> %tmp1, %tmp2 + store <8 x i32> %tmp3, ptr addrspace(1) %in + ret void +} + declare i16 @llvm.fshr.i16(i16, i16, i16) define void @test_rotr_i16(ptr addrspace(1) nocapture readonly %sourceA, ptr addrspace(1) nocapture readonly %sourceB, ptr addrspace(1) nocapture %destValues) { diff --git a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll index d496634ae474f..8af4a8de7b266 100644 --- a/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll +++ b/llvm/test/CodeGen/AMDGPU/vector_range_metadata.ll @@ -18,11 +18,11 @@ define <2 x i32> @test_add2x32(ptr %a_ptr, ptr %b_ptr) { ; CHECK-LABEL: test_add2x32: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: flat_load_dword v4, v[2:3] -; CHECK-NEXT: flat_load_dword v5, v[0:1] -; CHECK-NEXT: v_mov_b32_e32 v1, 48 +; CHECK-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; CHECK-NEXT: flat_load_dwordx2 v[6:7], v[2:3] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_or_b32_e32 v0, v5, v4 +; CHECK-NEXT: v_or_b32_e32 v1, v5, v7 +; CHECK-NEXT: v_or_b32_e32 v0, v4, v6 ; CHECK-NEXT: s_setpc_b64 s[30:31] %a = load <2 x i32>, ptr %a_ptr, !range !2, !noundef !{} %b = load <2 x i32>, ptr %b_ptr, !range !3, !noundef !{} diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll index 00bb7b24786f5..feb6ecd996516 100644 --- a/llvm/test/CodeGen/AMDGPU/xor.ll +++ b/llvm/test/CodeGen/AMDGPU/xor.ll @@ -1,6 +1,38 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI,GCN %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI,GCN %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=verde < %s | FileCheck -enable-var-scope -check-prefixes=SI %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -enable-var-scope -check-prefixes=VI %s + +define amdgpu_ps <2 x i32> @s_xor_v2i32(<2 x i32> inreg %num, <2 x i32> inreg %den) { +; SI-LABEL: s_xor_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; SI-NEXT: ; return to shader part epilog +; +; VI-LABEL: s_xor_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_xor_b64 s[0:1], s[0:1], s[2:3] +; VI-NEXT: ; return to shader part epilog + %result = xor <2 x i32> %num, %den + ret <2 x i32> %result +} + +define <2 x i32> @v_xor_v2i32(<2 x i32> %num, <2 x i32> %den) { +; SI-LABEL: v_xor_v2i32: +; SI: ; %bb.0: +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: v_xor_b32_e32 v1, v1, v3 +; SI-NEXT: v_xor_b32_e32 v0, v0, v2 +; SI-NEXT: s_setpc_b64 s[30:31] +; +; 
VI-LABEL: v_xor_v2i32: +; VI: ; %bb.0: +; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; VI-NEXT: v_xor_b32_e32 v1, v1, v3 +; VI-NEXT: v_xor_b32_e32 v0, v0, v2 +; VI-NEXT: s_setpc_b64 s[30:31] + %result = xor <2 x i32> %num, %den + ret <2 x i32> %result +} define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in0, ptr addrspace(1) %in1) { ; SI-LABEL: xor_v2i32: @@ -21,8 +53,8 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; SI-NEXT: s_mov_b32 s4, s0 ; SI-NEXT: s_mov_b32 s5, s1 ; SI-NEXT: s_waitcnt vmcnt(0) -; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: v_xor_b32_e32 v0, v2, v0 +; SI-NEXT: v_xor_b32_e32 v1, v3, v1 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; SI-NEXT: s_endpgm ; @@ -40,10 +72,11 @@ define amdgpu_kernel void @xor_v2i32(ptr addrspace(1) %out, ptr addrspace(1) %in ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: v_xor_b32_e32 v0, v0, v2 +; VI-NEXT: v_xor_b32_e32 v1, v1, v3 ; VI-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; VI-NEXT: s_endpgm + %a = load <2 x i32>, ptr addrspace(1) %in0 %b = load <2 x i32>, ptr addrspace(1) %in1 %result = xor <2 x i32> %a, %b @@ -152,6 +185,7 @@ define amdgpu_kernel void @xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0, ; VI-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm + %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 @@ -519,7 +553,7 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_cmp_ne_u64_e64 s[10:11], s[4:5], 0 ; SI-NEXT: s_and_b64 vcc, exec, s[10:11] -; SI-NEXT: s_cbranch_vccz .LBB12_4 +; SI-NEXT: s_cbranch_vccz .LBB14_4 ; SI-NEXT: ; %bb.1: ; %else ; SI-NEXT: s_mov_b32 s15, 0xf000 ; SI-NEXT: s_mov_b32 s14, -1 @@ -527,21 +561,21 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; SI-NEXT: s_mov_b32 s13, s3 ; SI-NEXT: buffer_load_dwordx2 v[0:1], off, s[12:15], 0 ; SI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; SI-NEXT: s_cbranch_vccnz .LBB12_3 -; SI-NEXT: .LBB12_2: ; %if +; SI-NEXT: s_cbranch_vccnz .LBB14_3 +; SI-NEXT: .LBB14_2: ; %if ; SI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 -; SI-NEXT: .LBB12_3: ; %endif +; SI-NEXT: .LBB14_3: ; %endif ; SI-NEXT: s_mov_b32 s3, 0xf000 ; SI-NEXT: s_mov_b32 s2, -1 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm -; SI-NEXT: .LBB12_4: +; SI-NEXT: .LBB14_4: ; SI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; SI-NEXT: s_branch .LBB12_2 +; SI-NEXT: s_branch .LBB14_2 ; ; VI-LABEL: xor_cf: ; VI: ; %bb.0: ; %entry @@ -549,27 +583,27 @@ define amdgpu_kernel void @xor_cf(ptr addrspace(1) %out, ptr addrspace(1) %in, i ; VI-NEXT: s_mov_b64 s[8:9], 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_cmp_lg_u64 s[4:5], 0 -; VI-NEXT: s_cbranch_scc0 .LBB12_4 +; VI-NEXT: s_cbranch_scc0 .LBB14_4 ; VI-NEXT: ; %bb.1: ; %else ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_andn2_b64 vcc, exec, s[8:9] -; VI-NEXT: s_cbranch_vccnz .LBB12_3 -; VI-NEXT: .LBB12_2: ; %if +; VI-NEXT: s_cbranch_vccnz .LBB14_3 +; VI-NEXT: .LBB14_2: ; %if ; VI-NEXT: s_xor_b64 s[2:3], s[4:5], s[6:7] ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_mov_b32_e32 
v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 -; VI-NEXT: .LBB12_3: ; %endif +; VI-NEXT: .LBB14_3: ; %endif ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm -; VI-NEXT: .LBB12_4: +; VI-NEXT: .LBB14_4: ; VI-NEXT: ; implicit-def: $vgpr0_vgpr1 -; VI-NEXT: s_branch .LBB12_2 +; VI-NEXT: s_branch .LBB14_2 entry: %0 = icmp eq i64 %a, 0 br i1 %0, label %if, label %else @@ -729,6 +763,7 @@ define amdgpu_kernel void @scalar_xor_neg_inline_imm_i64(ptr addrspace(1) %out, ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm + %or = xor i64 %a, -8 store i64 %or, ptr addrspace(1) %out ret void @@ -808,10 +843,9 @@ define amdgpu_kernel void @vector_xor_literal_i64(ptr addrspace(1) %out, ptr add ; VI-NEXT: v_xor_b32_e32 v0, 0xdf77987f, v0 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm + %loada = load i64, ptr addrspace(1) %a, align 8 %or = xor i64 %loada, 22470723082367 store i64 %or, ptr addrspace(1) %out ret void } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; GCN: {{.*}}