diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 989ec8d02d2f1..94e90a84a2d41 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1001,6 +1001,17 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) {
   return true;
 }
 
+static bool isVectorElementTypeUpsized(EVT EltVT) {
+  // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
+  // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
+  // vectorized loads/stores with the actual element type for i8/i16 as that
+  // would require v8/v16 variants that do not exist.
+  // In order to load/store such vectors efficiently, in Type Legalization
+  // we split the vector into word-sized chunks (v2x16/v4i8). Now, we will
+  // lower to PTX as vectors of b32.
+  return Isv2x16VT(EltVT) || EltVT == MVT::v4i8;
+}
+
 bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
   MemSDNode *MemSD = cast<MemSDNode>(N);
   EVT LoadedVT = MemSD->getMemoryVT();
@@ -1055,11 +1066,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
 
   EVT EltVT = N->getValueType(0);
 
-  // v8x16 is a special case. PTX doesn't have ld.v8.16
-  // instruction. Instead, we split the vector into v2x16 chunks and
-  // load them with ld.v4.b32.
-  if (Isv2x16VT(EltVT)) {
-    assert(N->getOpcode() == NVPTXISD::LoadV4 && "Unexpected load opcode.");
+  if (isVectorElementTypeUpsized(EltVT)) {
     EltVT = MVT::i32;
     FromType = NVPTX::PTXLdStInstCode::Untyped;
     FromTypeWidth = 32;
@@ -1223,16 +1230,16 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
   if (EltVT.isVector()) {
     NumElts = EltVT.getVectorNumElements();
     EltVT = EltVT.getVectorElementType();
-    // vectors of 16bits type are loaded/stored as multiples of v2x16 elements.
+    // vectors of 8/16bits type are loaded/stored as multiples of v4i8/v2x16
+    // elements.
     if ((EltVT == MVT::f16 && OrigType == MVT::v2f16) ||
         (EltVT == MVT::bf16 && OrigType == MVT::v2bf16) ||
-        (EltVT == MVT::i16 && OrigType == MVT::v2i16)) {
-      assert(NumElts % 2 == 0 && "Vector must have even number of elements");
-      EltVT = OrigType;
-      NumElts /= 2;
-    } else if (OrigType == MVT::v4i8) {
+        (EltVT == MVT::i16 && OrigType == MVT::v2i16) ||
+        (EltVT == MVT::i8 && OrigType == MVT::v4i8)) {
+      assert(NumElts % OrigType.getVectorNumElements() == 0 &&
+             "NumElts must be divisible by the number of elts in subvectors");
       EltVT = OrigType;
-      NumElts = 1;
+      NumElts /= OrigType.getVectorNumElements();
     }
   }
 
@@ -1739,11 +1746,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
     return false;
   }
 
-  // v8x16 is a special case. PTX doesn't have st.v8.x16
-  // instruction. Instead, we split the vector into v2x16 chunks and
-  // store them with st.v4.b32.
-  if (Isv2x16VT(EltVT)) {
-    assert(N->getOpcode() == NVPTXISD::StoreV4 && "Unexpected load opcode.");
+  if (isVectorElementTypeUpsized(EltVT)) {
     EltVT = MVT::i32;
     ToType = NVPTX::PTXLdStInstCode::Untyped;
     ToTypeWidth = 32;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7aa8b6ff55e8a..3e01def18e6e7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -136,6 +136,8 @@ static bool IsPTXVectorType(MVT VT) {
   case MVT::v4i1:
   case MVT::v2i8:
   case MVT::v4i8:
+  case MVT::v8i8:  // <2 x i8x4>
+  case MVT::v16i8: // <4 x i8x4>
   case MVT::v2i16:
   case MVT::v4i16:
   case MVT::v8i16: // <4 x i16x2>
@@ -160,6 +162,67 @@ static bool Is16bitsType(MVT VT) {
           VT.SimpleTy == MVT::i16);
 }
 
+// When legalizing vector loads/stores, this function is called, which does two
+// things:
+// 1. Determines whether the vector is something we want to custom lower,
+//    std::nullopt is returned if we do not want to custom lower it.
+// 2. If we do want to handle it, returns two parameters:
+//    - unsigned int NumElts - The number of elements in the final vector
+//    - EVT EltVT - The type of the elements in the final vector
+static std::optional<std::pair<unsigned int, EVT>>
+getVectorLoweringShape(EVT VectorVT) {
+  if (!VectorVT.isVector() || !VectorVT.isSimple())
+    return std::nullopt;
+
+  EVT EltVT = VectorVT.getVectorElementType();
+  unsigned NumElts = VectorVT.getVectorNumElements();
+
+  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
+  // legal. We can (and should) split that into 2 stores of <2 x double> here
+  // but I'm leaving that as a TODO for now.
+  switch (VectorVT.getSimpleVT().SimpleTy) {
+  default:
+    return std::nullopt;
+  case MVT::v2i8:
+  case MVT::v2i16:
+  case MVT::v2i32:
+  case MVT::v2i64:
+  case MVT::v2f16:
+  case MVT::v2bf16:
+  case MVT::v2f32:
+  case MVT::v2f64:
+  case MVT::v4i8:
+  case MVT::v4i16:
+  case MVT::v4i32:
+  case MVT::v4f16:
+  case MVT::v4bf16:
+  case MVT::v4f32:
+    // This is a "native" vector type
+    return std::pair(NumElts, EltVT);
+  case MVT::v8i8:   // <2 x i8x4>
+  case MVT::v8f16:  // <4 x f16x2>
+  case MVT::v8bf16: // <4 x bf16x2>
+  case MVT::v8i16:  // <4 x i16x2>
+  case MVT::v16i8:  // <4 x i8x4>
+    // This can be upsized into a "native" vector type.
+    // Despite vectors like v8i8, v16i8, v8i16 being within the bit-limit for
+    // total load/store size, PTX syntax only supports v2/v4. Thus, we can't use
+    // vectorized loads/stores with the actual element type for i8/i16 as that
+    // would require v8/v16 variants that do not exist.
+    // In order to load/store such vectors efficiently, here in Type
+    // Legalization, we split the vector into word-sized chunks (v2x16/v4i8).
+    // Later, we will lower to PTX as vectors of b32.
+
+    // Number of elements to pack in one word.
+    unsigned NPerWord = 32 / EltVT.getSizeInBits();
+
+    return std::pair(NumElts / NPerWord,
+                     MVT::getVectorVT(EltVT.getSimpleVT(), NPerWord));
+  }
+
+  llvm_unreachable("All cases in switch should return.");
+};
+
 /// ComputePTXValueVTs - For the given Type \p Ty, returns the set of primitive
 /// EVTs that compose it. Unlike ComputeValueVTs, this will break apart vectors
 /// into their primitive components.
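
As an illustration of the new mapping (a sketch for review, not part of the patch): getVectorLoweringShape leaves native shapes such as v4f32 untouched, but upsizes v8i16 to 4 x v2i16, v16i8 to 4 x v4i8, and v8i8 to 2 x v4i8, one b32 word per chunk. Assuming the lowering above, IR like the following should now select the b32 forms (function names here are made up; the expected PTX matches the tests updated later in this patch):

    ; llc -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 shapes.ll
    define <8 x i16> @load_v8i16(ptr %p) {
      ; 8 x i16 = four 32-bit words -> expected to emit one "ld.v4.b32"
      %v = load <8 x i16>, ptr %p, align 16
      ret <8 x i16> %v
    }

    define <8 x i8> @load_v8i8(ptr %p) {
      ; 8 x i8 = two 32-bit words -> expected to emit one "ld.v2.b32"
      %v = load <8 x i8>, ptr %p, align 8
      ret <8 x i8> %v
    }
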
@@ -766,8 +829,8 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
 
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
-                       ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::VSELECT, ISD::BUILD_VECTOR});
+                       ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM, ISD::VSELECT,
+                       ISD::BUILD_VECTOR});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -2807,122 +2870,86 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
   SDLoc DL(N);
   EVT ValVT = Val.getValueType();
 
-  if (ValVT.isVector()) {
-    // We only handle "native" vector sizes for now, e.g. <4 x double> is not
-    // legal. We can (and should) split that into 2 stores of <2 x double> here
-    // but I'm leaving that as a TODO for now.
-    if (!ValVT.isSimple())
-      return SDValue();
-    switch (ValVT.getSimpleVT().SimpleTy) {
-    default:
-      return SDValue();
-    case MVT::v2i8:
-    case MVT::v2i16:
-    case MVT::v2i32:
-    case MVT::v2i64:
-    case MVT::v2f16:
-    case MVT::v2bf16:
-    case MVT::v2f32:
-    case MVT::v2f64:
-    case MVT::v4i8:
-    case MVT::v4i16:
-    case MVT::v4i32:
-    case MVT::v4f16:
-    case MVT::v4bf16:
-    case MVT::v4f32:
-    case MVT::v8f16:  // <4 x f16x2>
-    case MVT::v8bf16: // <4 x bf16x2>
-    case MVT::v8i16:  // <4 x i16x2>
-      // This is a "native" vector type
-      break;
-    }
-
-    MemSDNode *MemSD = cast<MemSDNode>(N);
-    const DataLayout &TD = DAG.getDataLayout();
-
-    Align Alignment = MemSD->getAlign();
-    Align PrefAlign =
-        TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
-    if (Alignment < PrefAlign) {
-      // This store is not sufficiently aligned, so bail out and let this vector
-      // store be scalarized.  Note that we may still be able to emit smaller
-      // vector stores. For example, if we are storing a <4 x float> with an
-      // alignment of 8, this check will fail but the legalizer will try again
-      // with 2 x <2 x float>, which will succeed with an alignment of 8.
-      return SDValue();
-    }
+  auto NumEltsAndEltVT = getVectorLoweringShape(ValVT);
+  if (!NumEltsAndEltVT)
+    return SDValue();
+  auto [NumElts, EltVT] = NumEltsAndEltVT.value();
 
-    unsigned Opcode = 0;
-    EVT EltVT = ValVT.getVectorElementType();
-    unsigned NumElts = ValVT.getVectorNumElements();
+  MemSDNode *MemSD = cast<MemSDNode>(N);
+  const DataLayout &TD = DAG.getDataLayout();
 
-    // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
-    // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
-    // stored type to i16 and propagate the "real" type as the memory type.
-    bool NeedExt = false;
-    if (EltVT.getSizeInBits() < 16)
-      NeedExt = true;
+  Align Alignment = MemSD->getAlign();
+  Align PrefAlign = TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
+  if (Alignment < PrefAlign) {
+    // This store is not sufficiently aligned, so bail out and let this vector
+    // store be scalarized.  Note that we may still be able to emit smaller
+    // vector stores. For example, if we are storing a <4 x float> with an
+    // alignment of 8, this check will fail but the legalizer will try again
+    // with 2 x <2 x float>, which will succeed with an alignment of 8.
+    return SDValue();
+  }
 
-    bool StoreF16x2 = false;
-    switch (NumElts) {
-    default:
-      return SDValue();
-    case 2:
-      Opcode = NVPTXISD::StoreV2;
-      break;
-    case 4:
-      Opcode = NVPTXISD::StoreV4;
-      break;
-    case 8:
-      // v8f16 is a special case. PTX doesn't have st.v8.f16
-      // instruction. Instead, we split the vector into v2f16 chunks and
-      // store them with st.v4.b32.
-      assert(Is16bitsType(EltVT.getSimpleVT()) && "Wrong type for the vector.");
-      Opcode = NVPTXISD::StoreV4;
-      StoreF16x2 = true;
-      break;
-    }
+  // Since StoreV2 is a target node, we cannot rely on DAG type legalization.
+  // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
+  // stored type to i16 and propagate the "real" type as the memory type.
+  bool NeedExt = false;
+  if (EltVT.getSizeInBits() < 16)
+    NeedExt = true;
 
-    SmallVector<SDValue> Ops;
+  unsigned Opcode = 0;
+  switch (NumElts) {
+  default:
+    return SDValue();
+  case 2:
+    Opcode = NVPTXISD::StoreV2;
+    break;
+  case 4:
+    Opcode = NVPTXISD::StoreV4;
+    break;
+  }
 
-    // First is the chain
-    Ops.push_back(N->getOperand(0));
+  SmallVector<SDValue> Ops;
 
-    if (StoreF16x2) {
-      // Combine f16,f16 -> v2f16
-      NumElts /= 2;
-      for (unsigned i = 0; i < NumElts; ++i) {
-        SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
-                                 DAG.getIntPtrConstant(i * 2, DL));
-        SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
-                                 DAG.getIntPtrConstant(i * 2 + 1, DL));
-        EVT VecVT = EVT::getVectorVT(*DAG.getContext(), EltVT, 2);
-        SDValue V2 = DAG.getNode(ISD::BUILD_VECTOR, DL, VecVT, E0, E1);
-        Ops.push_back(V2);
-      }
-    } else {
-      // Then the split values
-      for (unsigned i = 0; i < NumElts; ++i) {
-        SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
-                                     DAG.getIntPtrConstant(i, DL));
-        if (NeedExt)
-          ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
-        Ops.push_back(ExtVal);
-      }
+  // First is the chain
+  Ops.push_back(N->getOperand(0));
+
+  // Then the split values
+  assert(NumElts <= ValVT.getVectorNumElements() &&
+         "NumElts should not increase, only decrease or stay the same.");
+  if (NumElts < ValVT.getVectorNumElements()) {
+    // If the number of elements has decreased, getVectorLoweringShape has
+    // upsized the element types
+    assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
+           EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
+    // Combine individual elements into v2[i,f,bf]16/v4i8 subvectors to be
+    // stored as b32s
+    unsigned NumEltsPerSubVector = EltVT.getVectorNumElements();
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SmallVector<SDValue> SubVectorElts;
+      DAG.ExtractVectorElements(Val, SubVectorElts, i * NumEltsPerSubVector,
+                                NumEltsPerSubVector);
+      SDValue SubVector = DAG.getBuildVector(EltVT, DL, SubVectorElts);
+      Ops.push_back(SubVector);
     }
+  } else {
+    for (unsigned i = 0; i < NumElts; ++i) {
+      SDValue ExtVal = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Val,
+                                   DAG.getIntPtrConstant(i, DL));
+      if (NeedExt)
+        ExtVal = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i16, ExtVal);
+      Ops.push_back(ExtVal);
+    }
+  }
 
-    // Then any remaining arguments
-    Ops.append(N->op_begin() + 2, N->op_end());
-
-    SDValue NewSt =
-        DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
-                                MemSD->getMemoryVT(), MemSD->getMemOperand());
+  // Then any remaining arguments
+  Ops.append(N->op_begin() + 2, N->op_end());
 
-    // return DCI.CombineTo(N, NewSt, true);
-    return NewSt;
-  }
+  SDValue NewSt =
+      DAG.getMemIntrinsicNode(Opcode, DL, DAG.getVTList(MVT::Other), Ops,
+                              MemSD->getMemoryVT(), MemSD->getMemOperand());
 
-  return SDValue();
+  // return DCI.CombineTo(N, NewSt, true);
+  return NewSt;
 }
 
 // st i1 v, addr
@@ -5077,49 +5104,6 @@ static SDValue PerformVSELECTCombine(SDNode *N,
   return DCI.DAG.getNode(ISD::BUILD_VECTOR, DL, MVT::v4i8, E);
 }
 
-static SDValue PerformLOADCombine(SDNode *N,
-                                  TargetLowering::DAGCombinerInfo &DCI) {
-  SelectionDAG &DAG = DCI.DAG;
-  LoadSDNode *LD = cast<LoadSDNode>(N);
-
-  // Lower a v16i8 load into a LoadV4 operation with i32 results instead of
-  // letting ReplaceLoadVector split it into smaller loads during legalization.
-  // This is done at dag-combine1 time, so that vector operations with i8
-  // elements can be optimised away instead of being needlessly split during
-  // legalization, which involves storing to the stack and loading it back.
-  EVT VT = N->getValueType(0);
-  bool CorrectlyAligned =
-      DCI.DAG.getTargetLoweringInfo().allowsMemoryAccessForAlignment(
-          *DAG.getContext(), DAG.getDataLayout(), LD->getMemoryVT(),
-          *LD->getMemOperand());
-  if (!(VT == MVT::v16i8 && CorrectlyAligned))
-    return SDValue();
-
-  SDLoc DL(N);
-
-  // Create a v4i32 vector load operation, effectively <4 x v4i8>.
-  unsigned Opc = NVPTXISD::LoadV4;
-  EVT NewVT = MVT::v4i32;
-  EVT EltVT = NewVT.getVectorElementType();
-  unsigned NumElts = NewVT.getVectorNumElements();
-  EVT RetVTs[] = {EltVT, EltVT, EltVT, EltVT, MVT::Other};
-  SDVTList RetVTList = DAG.getVTList(RetVTs);
-  SmallVector<SDValue> Ops(N->ops());
-  Ops.push_back(DAG.getIntPtrConstant(LD->getExtensionType(), DL));
-  SDValue NewLoad = DAG.getMemIntrinsicNode(Opc, DL, RetVTList, Ops, NewVT,
-                                            LD->getMemOperand());
-  SDValue NewChain = NewLoad.getValue(NumElts);
-
-  // Create a vector of the same type returned by the original load.
-  SmallVector<SDValue> Elts;
-  for (unsigned i = 0; i < NumElts; i++)
-    Elts.push_back(NewLoad.getValue(i));
-  return DCI.DAG.getMergeValues(
-      {DCI.DAG.getBitcast(VT, DCI.DAG.getBuildVector(NewVT, DL, Elts)),
-       NewChain},
-      DL);
-}
-
 static SDValue PerformBUILD_VECTORCombine(SDNode *N,
                                           TargetLowering::DAGCombinerInfo &DCI) {
   auto VT = N->getValueType(0);
@@ -5200,8 +5184,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     return PerformREMCombine(N, DCI, OptLevel);
   case ISD::SETCC:
     return PerformSETCCCombine(N, DCI, STI.getSmVersion());
-  case ISD::LOAD:
-    return PerformLOADCombine(N, DCI);
   case NVPTXISD::StoreRetval:
   case NVPTXISD::StoreRetvalV2:
  case NVPTXISD::StoreRetvalV4:
@@ -5250,32 +5232,10 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
 
   assert(ResVT.isVector() && "Vector load must have vector type");
 
-  // We only handle "native" vector sizes for now, e.g. <4 x double> is not
-  // legal. We can (and should) split that into 2 loads of <2 x double> here
-  // but I'm leaving that as a TODO for now.
-  assert(ResVT.isSimple() && "Can only handle simple types");
-  switch (ResVT.getSimpleVT().SimpleTy) {
-  default:
+  auto NumEltsAndEltVT = getVectorLoweringShape(ResVT);
+  if (!NumEltsAndEltVT)
     return;
-  case MVT::v2i8:
-  case MVT::v2i16:
-  case MVT::v2i32:
-  case MVT::v2i64:
-  case MVT::v2f16:
-  case MVT::v2f32:
-  case MVT::v2f64:
-  case MVT::v4i8:
-  case MVT::v4i16:
-  case MVT::v4i32:
-  case MVT::v4f16:
-  case MVT::v4bf16:
-  case MVT::v4f32:
-  case MVT::v8f16:  // <4 x f16x2>
-  case MVT::v8bf16: // <4 x bf16x2>
-  case MVT::v8i16:  // <4 x i16x2>
-    // This is a "native" vector type
-    break;
-  }
+  auto [NumElts, EltVT] = NumEltsAndEltVT.value();
 
   LoadSDNode *LD = cast<LoadSDNode>(N);
 
@@ -5292,9 +5252,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     return;
   }
 
-  EVT EltVT = ResVT.getVectorElementType();
-  unsigned NumElts = ResVT.getVectorNumElements();
-
   // Since LoadV2 is a target node, we cannot rely on DAG type legalization.
   // Therefore, we must ensure the type is legal.  For i1 and i8, we set the
   // loaded type to i16 and propagate the "real" type as the memory type.
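
On the load side, a hedged sketch of what ReplaceLoadVector should now produce for an upsized type (a hypothetical reduced test, mirroring the ld_global_v8i8 check added to ldg-invariant.ll below): the v8i8 value is fetched as two b32 words and the individual i8 lanes are recovered from the v4i8 subvectors via EXTRACT_VECTOR_ELT:

    ; llc -mtriple=nvptx64 -mcpu=sm_35 example.ll
    define i8 @sum_first_last_v8i8(ptr addrspace(1) %p) {
      ; expected: one "ld.global.nc.v2.u32", then per-lane "bfe.u32" extracts
      %v = load <8 x i8>, ptr addrspace(1) %p, !invariant.load !0
      %e0 = extractelement <8 x i8> %v, i32 0
      %e7 = extractelement <8 x i8> %v, i32 7
      %s = add i8 %e0, %e7
      ret i8 %s
    }

    !0 = !{}
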
@@ -5306,7 +5263,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
 
   unsigned Opcode = 0;
   SDVTList LdResVTs;
-  bool Load16x2 = false;
 
   switch (NumElts) {
   default:
@@ -5321,31 +5277,6 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
     LdResVTs = DAG.getVTList(ListVTs);
     break;
   }
-  case 8: {
-    // v8f16 is a special case. PTX doesn't have ld.v8.f16
-    // instruction. Instead, we split the vector into v2f16 chunks and
-    // load them with ld.v4.b32.
-    assert(Is16bitsType(EltVT.getSimpleVT()) && "Unsupported v8 vector type.");
-    Load16x2 = true;
-    Opcode = NVPTXISD::LoadV4;
-    EVT VVT;
-    switch (EltVT.getSimpleVT().SimpleTy) {
-    case MVT::f16:
-      VVT = MVT::v2f16;
-      break;
-    case MVT::bf16:
-      VVT = MVT::v2bf16;
-      break;
-    case MVT::i16:
-      VVT = MVT::v2i16;
-      break;
-    default:
-      llvm_unreachable("Unsupported v8 vector type.");
-    }
-    EVT ListVTs[] = {VVT, VVT, VVT, VVT, MVT::Other};
-    LdResVTs = DAG.getVTList(ListVTs);
-    break;
-  }
   }
 
   // Copy regular operands
@@ -5359,18 +5290,19 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
                               LD->getMemoryVT(),
                               LD->getMemOperand());
 
-  SmallVector<SDValue> ScalarRes;
-  if (Load16x2) {
-    // Split v2f16 subvectors back into individual elements.
-    NumElts /= 2;
+  SmallVector<SDValue> ScalarRes;
+  assert(NumElts <= ResVT.getVectorNumElements() &&
+         "NumElts should not increase, only decrease or stay the same.");
+  if (NumElts < ResVT.getVectorNumElements()) {
+    // If the number of elements has decreased, getVectorLoweringShape has
+    // upsized the element types
+    assert(EltVT.isVector() && EltVT.getSizeInBits() == 32 &&
+           EltVT.getVectorNumElements() <= 4 && "Unexpected upsized type.");
+    // Generate EXTRACT_VECTOR_ELTs to split v2[i,f,bf]16/v4i8 subvectors back
+    // into individual elements.
     for (unsigned i = 0; i < NumElts; ++i) {
       SDValue SubVector = NewLD.getValue(i);
-      SDValue E0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
-                               DAG.getIntPtrConstant(0, DL));
-      SDValue E1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, SubVector,
-                               DAG.getIntPtrConstant(1, DL));
-      ScalarRes.push_back(E0);
-      ScalarRes.push_back(E1);
+      DAG.ExtractVectorElements(SubVector, ScalarRes);
     }
   } else {
     for (unsigned i = 0; i < NumElts; ++i) {
diff --git a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
index 028fab7ae54d6..e46657e4a582f 100644
--- a/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
+++ b/llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
@@ -172,30 +172,34 @@ define float @ff(ptr %p) {
 
 define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr2) {
 ; ENABLED-LABEL: combine_v16i8(
 ; ENABLED:       {
-; ENABLED-NEXT:    .reg .b32 %r<40>;
+; ENABLED-NEXT:    .reg .b32 %r<36>;
 ; ENABLED-NEXT:    .reg .b64 %rd<3>;
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
 ; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_param_0];
-; ENABLED-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; ENABLED-NEXT:    ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
 ; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_param_1];
-; ENABLED-NEXT:    bfe.u32 %r9, %r1, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r10, %r1, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r11, %r1, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r12, %r1, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r13, %r2, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r14, %r2, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r15, %r2, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r16, %r2, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r17, %r3, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r18, %r3, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r19, %r3, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r20, %r3, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r21, %r4, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r22, %r4, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r23, %r4, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r24, %r4, 24, 8;
-; ENABLED-NEXT:    add.s32 %r25, %r9, %r10;
+; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r8, %r1, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r9, %r2, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r10, %r2, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r11, %r2, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r12, %r2, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r13, %r3, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r14, %r3, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r15, %r3, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r17, %r4, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r18, %r4, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r19, %r4, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r20, %r4, 24, 8;
+; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
+; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
+; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
+; ENABLED-NEXT:    add.s32 %r24, %r23, %r9;
+; ENABLED-NEXT:    add.s32 %r25, %r24, %r10;
 ; ENABLED-NEXT:    add.s32 %r26, %r25, %r11;
 ; ENABLED-NEXT:    add.s32 %r27, %r26, %r12;
 ; ENABLED-NEXT:    add.s32 %r28, %r27, %r13;
@@ -206,11 +210,7 @@ define void @combine_v16i8(ptr noundef align 16 %ptr1, ptr noundef align 16 %ptr
 ; ENABLED-NEXT:    add.s32 %r33, %r32, %r18;
 ; ENABLED-NEXT:    add.s32 %r34, %r33, %r19;
 ; ENABLED-NEXT:    add.s32 %r35, %r34, %r20;
-; ENABLED-NEXT:    add.s32 %r36, %r35, %r21;
-; ENABLED-NEXT:    add.s32 %r37, %r36, %r22;
-; ENABLED-NEXT:    add.s32 %r38, %r37, %r23;
-; ENABLED-NEXT:    add.s32 %r39, %r38, %r24;
-; ENABLED-NEXT:    st.u32 [%rd2], %r39;
+; ENABLED-NEXT:    st.u32 [%rd2], %r35;
 ; ENABLED-NEXT:    ret;
 ;
 ; DISABLED-LABEL: combine_v16i8(
@@ -328,27 +328,25 @@ define void @combine_v16i8_unaligned(ptr noundef align 8 %ptr1, ptr noundef alig
 ; ENABLED-EMPTY:
 ; ENABLED-NEXT:  // %bb.0:
 ; ENABLED-NEXT:    ld.param.u64 %rd1, [combine_v16i8_unaligned_param_0];
-; ENABLED-NEXT:    ld.u32 %r1, [%rd1+4];
-; ENABLED-NEXT:    ld.u32 %r2, [%rd1];
+; ENABLED-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
 ; ENABLED-NEXT:    ld.param.u64 %rd2, [combine_v16i8_unaligned_param_1];
-; ENABLED-NEXT:    ld.u32 %r3, [%rd1+12];
-; ENABLED-NEXT:    ld.u32 %r4, [%rd1+8];
-; ENABLED-NEXT:    bfe.u32 %r5, %r2, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r6, %r2, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r7, %r2, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r8, %r2, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r9, %r1, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r10, %r1, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r11, %r1, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r12, %r1, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r13, %r4, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r14, %r4, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r15, %r4, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r16, %r4, 24, 8;
-; ENABLED-NEXT:    bfe.u32 %r17, %r3, 0, 8;
-; ENABLED-NEXT:    bfe.u32 %r18, %r3, 8, 8;
-; ENABLED-NEXT:    bfe.u32 %r19, %r3, 16, 8;
-; ENABLED-NEXT:    bfe.u32 %r20, %r3, 24, 8;
+; ENABLED-NEXT:    ld.v2.b32 {%r3, %r4}, [%rd1+8];
+; ENABLED-NEXT:    bfe.u32 %r5, %r1, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r6, %r1, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r7, %r1, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r8, %r1, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r9, %r2, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r10, %r2, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r11, %r2, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r12, %r2, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r13, %r3, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r14, %r3, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r15, %r3, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; ENABLED-NEXT:    bfe.u32 %r17, %r4, 0, 8;
+; ENABLED-NEXT:    bfe.u32 %r18, %r4, 8, 8;
+; ENABLED-NEXT:    bfe.u32 %r19, %r4, 16, 8;
+; ENABLED-NEXT:    bfe.u32 %r20, %r4, 24, 8;
 ; ENABLED-NEXT:    add.s32 %r21, %r5, %r6;
 ; ENABLED-NEXT:    add.s32 %r22, %r21, %r7;
 ; ENABLED-NEXT:    add.s32 %r23, %r22, %r8;
diff --git a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
index c143d7674a792..3853ec5c4151a 100644
--- a/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
+++ b/llvm/test/CodeGen/NVPTX/i8x4-instructions.ll
@@ -809,10 +809,8 @@ define void @test_ldst_v8i8(ptr %a, ptr %b) {
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u64 %rd2, [test_ldst_v8i8_param_1];
 ; CHECK-NEXT:    ld.param.u64 %rd1, [test_ldst_v8i8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
-; CHECK-NEXT:    ld.u32 %r2, [%rd1+4];
-; CHECK-NEXT:    st.u32 [%rd2+4], %r2;
-; CHECK-NEXT:    st.u32 [%rd2], %r1;
+; CHECK-NEXT:    ld.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    st.v2.b32 [%rd2], {%r1, %r2};
 ; CHECK-NEXT:    ret;
   %t1 = load <8 x i8>, ptr %a
   store <8 x i8> %t1, ptr %b, align 16
diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
index 2a645c2249fd2..16a0189e784bd 100644
--- a/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
+++ b/llvm/test/CodeGen/NVPTX/ldg-invariant.ll
@@ -1,21 +1,45 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_35 -verify-machineinstrs | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_35 | %ptxas-verify %}
 
 ; Check that invariant loads from the global addrspace are lowered to
 ; ld.global.nc.
 
-; CHECK-LABEL: @ld_global
 define i32 @ld_global(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.nc.{{[a-z]}}32
+; CHECK-LABEL: ld_global(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_param_0];
+; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %a = load i32, ptr addrspace(1) %ptr, !invariant.load !0
   ret i32 %a
 }
 
-; CHECK-LABEL: @ld_global_v2f16
 define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
 ; Load of v2f16 is weird. We consider it to be a legal type, which happens to be
 ; loaded/stored as a 32-bit scalar.
-; CHECK: ld.global.nc.u32
+; CHECK-LABEL: ld_global_v2f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .f32 %f<4>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2f16_param_0];
+; CHECK-NEXT:    ld.global.nc.u32 %r1, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs3, %f3;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs3;
+; CHECK-NEXT:    ret;
   %a = load <2 x half>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <2 x half> %a, i32 0
   %v2 = extractelement <2 x half> %a, i32 1
@@ -23,12 +47,33 @@ define half @ld_global_v2f16(ptr addrspace(1) %ptr) {
   ret half %sum
 }
 
-; CHECK-LABEL: @ld_global_v4f16
 define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
 ; Larger f16 vectors may be split into individual f16 elements and multiple
 ; loads/stores may be vectorized using f16 element type. Practically it's
 ; limited to v4 variant only.
-; CHECK: ld.global.nc.v4.u16
+; CHECK-LABEL: ld_global_v4f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NEXT:    .reg .f32 %f<10>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4f16_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %f2, %rs1;
+; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NEXT:    cvt.f32.f16 %f4, %rs4;
+; CHECK-NEXT:    cvt.f32.f16 %f5, %rs3;
+; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
+; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
+; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT:    ret;
   %a = load <4 x half>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <4 x half> %a, i32 0
   %v2 = extractelement <4 x half> %a, i32 1
@@ -40,11 +85,37 @@ define half @ld_global_v4f16(ptr addrspace(1) %ptr) {
   ret half %sum
 }
 
-; CHECK-LABEL: @ld_global_v8f16
 define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
 ; Larger vectors are, again, loaded as v4i32. PTX has no v8 variants of loads/stores,
 ; so load/store vectorizer has to convert v8f16 -> v4 x v2f16.
-; CHECK: ld.global.nc.v4.u32
+; CHECK-LABEL: ld_global_v8f16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .f32 %f<10>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8f16_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs2, tmp}, %r4; }
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs3, tmp}, %r1; }
+; CHECK-NEXT:    { .reg .b16 tmp; mov.b32 {%rs4, tmp}, %r2; }
+; CHECK-NEXT:    cvt.f32.f16 %f1, %rs4;
+; CHECK-NEXT:    cvt.f32.f16 %f2, %rs3;
+; CHECK-NEXT:    add.rn.f32 %f3, %f2, %f1;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs5, %f3;
+; CHECK-NEXT:    cvt.f32.f16 %f4, %rs2;
+; CHECK-NEXT:    cvt.f32.f16 %f5, %rs1;
+; CHECK-NEXT:    add.rn.f32 %f6, %f5, %f4;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs6, %f6;
+; CHECK-NEXT:    cvt.f32.f16 %f7, %rs6;
+; CHECK-NEXT:    cvt.f32.f16 %f8, %rs5;
+; CHECK-NEXT:    add.rn.f32 %f9, %f8, %f7;
+; CHECK-NEXT:    cvt.rn.f16.f32 %rs7, %f9;
+; CHECK-NEXT:    st.param.b16 [func_retval0], %rs7;
+; CHECK-NEXT:    ret;
   %a = load <8 x half>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <8 x half> %a, i32 0
   %v2 = extractelement <8 x half> %a, i32 2
@@ -56,9 +127,110 @@ define half @ld_global_v8f16(ptr addrspace(1) %ptr) {
   ret half %sum
 }
 
-; CHECK-LABEL: @ld_global_v2i32
+define i8 @ld_global_v8i8(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: ld_global_v8i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<8>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v8i8_param_0];
+; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT:    bfe.u32 %r4, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs2, %r4;
+; CHECK-NEXT:    bfe.u32 %r5, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT:    bfe.u32 %r6, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs4, %r6;
+; CHECK-NEXT:    add.s16 %rs5, %rs4, %rs3;
+; CHECK-NEXT:    add.s16 %rs6, %rs2, %rs1;
+; CHECK-NEXT:    add.s16 %rs7, %rs5, %rs6;
+; CHECK-NEXT:    cvt.u32.u16 %r7, %rs7;
+; CHECK-NEXT:    and.b32 %r8, %r7, 255;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r8;
+; CHECK-NEXT:    ret;
+  %a = load <8 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
+  %v1 = extractelement <8 x i8> %a, i32 0
+  %v2 = extractelement <8 x i8> %a, i32 2
+  %v3 = extractelement <8 x i8> %a, i32 4
+  %v4 = extractelement <8 x i8> %a, i32 6
+  %sum1 = add i8 %v1, %v2
+  %sum2 = add i8 %v3, %v4
+  %sum = add i8 %sum1, %sum2
+  ret i8 %sum
+}
+
+define i8 @ld_global_v16i8(ptr addrspace(1) %ptr) {
+; CHECK-LABEL: ld_global_v16i8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<16>;
+; CHECK-NEXT:    .reg .b32 %r<15>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v16i8_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r5, %r4, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT:    bfe.u32 %r6, %r4, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs2, %r6;
+; CHECK-NEXT:    bfe.u32 %r7, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT:    bfe.u32 %r8, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs4, %r8;
+; CHECK-NEXT:    bfe.u32 %r9, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r9;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs6, %r10;
+; CHECK-NEXT:    bfe.u32 %r11, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r11;
+; CHECK-NEXT:    bfe.u32 %r12, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs8, %r12;
+; CHECK-NEXT:    add.s16 %rs9, %rs8, %rs7;
+; CHECK-NEXT:    add.s16 %rs10, %rs6, %rs5;
+; CHECK-NEXT:    add.s16 %rs11, %rs4, %rs3;
+; CHECK-NEXT:    add.s16 %rs12, %rs2, %rs1;
+; CHECK-NEXT:    add.s16 %rs13, %rs9, %rs10;
+; CHECK-NEXT:    add.s16 %rs14, %rs11, %rs12;
+; CHECK-NEXT:    add.s16 %rs15, %rs13, %rs14;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs15;
+; CHECK-NEXT:    and.b32 %r14, %r13, 255;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r14;
+; CHECK-NEXT:    ret;
+  %a = load <16 x i8>, ptr addrspace(1) %ptr, !invariant.load !0
+  %v1 = extractelement <16 x i8> %a, i32 0
+  %v2 = extractelement <16 x i8> %a, i32 2
+  %v3 = extractelement <16 x i8> %a, i32 4
+  %v4 = extractelement <16 x i8> %a, i32 6
+  %v5 = extractelement <16 x i8> %a, i32 8
+  %v6 = extractelement <16 x i8> %a, i32 10
+  %v7 = extractelement <16 x i8> %a, i32 12
+  %v8 = extractelement <16 x i8> %a, i32 14
+  %sum1 = add i8 %v1, %v2
+  %sum2 = add i8 %v3, %v4
+  %sum3 = add i8 %v5, %v6
+  %sum4 = add i8 %v7, %v8
+  %sum5 = add i8 %sum1, %sum2
+  %sum6 = add i8 %sum3, %sum4
+  %sum7 = add i8 %sum5, %sum6
+  ret i8 %sum7
+}
+
 define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.nc.v2.{{[a-z]}}32
+; CHECK-LABEL: ld_global_v2i32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v2i32_param_0];
+; CHECK-NEXT:    ld.global.nc.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.s32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r3;
+; CHECK-NEXT:    ret;
   %a = load <2 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <2 x i32> %a, i32 0
   %v2 = extractelement <2 x i32> %a, i32 1
@@ -66,9 +238,20 @@ define i32 @ld_global_v2i32(ptr addrspace(1) %ptr) {
   ret i32 %sum
 }
 
-; CHECK-LABEL: @ld_global_v4i32
 define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.nc.v4.{{[a-z]}}32
+; CHECK-LABEL: ld_global_v4i32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<8>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_global_v4i32_param_0];
+; CHECK-NEXT:    ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.s32 %r5, %r1, %r2;
+; CHECK-NEXT:    add.s32 %r6, %r3, %r4;
+; CHECK-NEXT:    add.s32 %r7, %r5, %r6;
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r7;
+; CHECK-NEXT:    ret;
   %a = load <4 x i32>, ptr addrspace(1) %ptr, !invariant.load !0
   %v1 = extractelement <4 x i32> %a, i32 0
   %v2 = extractelement <4 x i32> %a, i32 1
@@ -80,16 +263,32 @@ define i32 @ld_global_v4i32(ptr addrspace(1) %ptr) {
   ret i32 %sum3
 }
 
-; CHECK-LABEL: @ld_not_invariant
 define i32 @ld_not_invariant(ptr addrspace(1) %ptr) {
-; CHECK: ld.global.{{[a-z]}}32
+; CHECK-LABEL: ld_not_invariant(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_invariant_param_0];
+; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %a = load i32, ptr addrspace(1) %ptr
   ret i32 %a
 }
 
-; CHECK-LABEL: @ld_not_global_addrspace
 define i32 @ld_not_global_addrspace(ptr addrspace(0) %ptr) {
-; CHECK: ld.{{[a-z]}}32
+; CHECK-LABEL: ld_not_global_addrspace(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [ld_not_global_addrspace_param_0];
+; CHECK-NEXT:    ld.u32 %r1, [%rd1];
+; CHECK-NEXT:    st.param.b32 [func_retval0], %r1;
+; CHECK-NEXT:    ret;
   %a = load i32, ptr addrspace(0) %ptr
   ret i32 %a
 }
diff --git a/llvm/test/CodeGen/NVPTX/load-store.ll b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
similarity index 63%
rename from llvm/test/CodeGen/NVPTX/load-store.ll
rename to llvm/test/CodeGen/NVPTX/load-store-scalars.ll
index 2582595ad7ca5..ed94cb416f472 100644
--- a/llvm/test/CodeGen/NVPTX/load-store.ll
+++ b/llvm/test/CodeGen/NVPTX/load-store-scalars.ll
@@ -4,18 +4,10 @@
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | FileCheck %s -check-prefixes=CHECK,SM70
 ; RUN: %if ptxas-12.2 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_70 -mattr=+ptx82 | %ptxas-verify -arch=sm_70 %}
 
-; TODO: add i1, <8 x i8>, and <6 x i8> vector tests.
-
-; TODO: add test for vectors that exceed 128-bit length
-; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
-; vectors cannot exceed 128-bit in length, i.e., .v4.u64 is not allowed.
-
 ; TODO: generate PTX that preserves Concurrent Forward Progress
 ; for atomic operations to local statespace
 ; by generating atomic or volatile operations.
 
-; TODO: design exposure for atomic operations on vector types.
-
 ; TODO: add weak,atomic,volatile,atomic volatile tests
 ; for .const and .param statespaces.
@@ -132,228 +124,6 @@ define void @generic_double(ptr %a) {
   ret void
 }
 
-; TODO: make the lowering of this weak vector ops consistent with
-; the ones of the next tests. This test lowers to a weak PTX
-; vector op, but next test lowers to a vector PTX op.
-define void @generic_2xi8(ptr %a) {
-; CHECK-LABEL: generic_2xi8(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi8_param_0];
-; CHECK-NEXT:    ld.v2.u8 {%rs1, %rs2}, [%rd1];
-; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.v2.u8 [%rd1], {%rs4, %rs3};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x i8>, ptr %a
-  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
-  store <2 x i8> %a.add, ptr %a
-  ret void
-}
-
-; TODO: make the lowering of this weak vector ops consistent with
-; the ones of the previous test. This test lowers to a weak
-; PTX scalar op, but prior test lowers to a vector PTX op.
-define void @generic_4xi8(ptr %a) {
-; CHECK-LABEL: generic_4xi8(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b32 %r<13>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi8_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
-; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
-; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
-; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
-; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.u32 [%rd1], %r12;
-; CHECK-NEXT:    ret;
-  %a.load = load <4 x i8>, ptr %a
-  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
-  store <4 x i8> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_2xi16(ptr %a) {
-; CHECK-LABEL: generic_2xi16(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi16_param_0];
-; CHECK-NEXT:    ld.u32 %r1, [%rd1];
-; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.u32 [%rd1], %r2;
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x i16>, ptr %a
-  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
-  store <2 x i16> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_4xi16(ptr %a) {
-; CHECK-LABEL: generic_4xi16(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi16_param_0];
-; CHECK-NEXT:    ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
-; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
-; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
-; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
-; CHECK-NEXT:    ret;
-  %a.load = load <4 x i16>, ptr %a
-  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
-  store <4 x i16> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_2xi32(ptr %a) {
-; CHECK-LABEL: generic_2xi32(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi32_param_0];
-; CHECK-NEXT:    ld.v2.u32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    add.s32 %r3, %r2, 1;
-; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.v2.u32 [%rd1], {%r4, %r3};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x i32>, ptr %a
-  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
-  store <2 x i32> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_4xi32(ptr %a) {
-; CHECK-LABEL: generic_4xi32(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xi32_param_0];
-; CHECK-NEXT:    ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    add.s32 %r5, %r4, 1;
-; CHECK-NEXT:    add.s32 %r6, %r3, 1;
-; CHECK-NEXT:    add.s32 %r7, %r2, 1;
-; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
-; CHECK-NEXT:    ret;
-  %a.load = load <4 x i32>, ptr %a
-  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
-  store <4 x i32> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_2xi64(ptr %a) {
-; CHECK-LABEL: generic_2xi64(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<6>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xi64_param_0];
-; CHECK-NEXT:    ld.v2.u64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
-; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.v2.u64 [%rd1], {%rd5, %rd4};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x i64>, ptr %a
-  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
-  store <2 x i64> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_2xfloat(ptr %a) {
-; CHECK-LABEL: generic_2xfloat(
-; CHECK:       {
-; CHECK-NEXT:    .reg .f32 %f<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xfloat_param_0];
-; CHECK-NEXT:    ld.v2.f32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.v2.f32 [%rd1], {%f4, %f3};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x float>, ptr %a
-  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
-  store <2 x float> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_4xfloat(ptr %a) {
-; CHECK-LABEL: generic_4xfloat(
-; CHECK:       {
-; CHECK-NEXT:    .reg .f32 %f<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_4xfloat_param_0];
-; CHECK-NEXT:    ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT:    ret;
-  %a.load = load <4 x float>, ptr %a
-  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
-  store <4 x float> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_2xdouble(ptr %a) {
-; CHECK-LABEL: generic_2xdouble(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .f64 %fd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_2xdouble_param_0];
-; CHECK-NEXT:    ld.v2.f64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.v2.f64 [%rd1], {%fd4, %fd3};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x double>, ptr %a
-  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
-  store <2 x double> %a.add, ptr %a
-  ret void
-}
-
 ; generic_volatile
 
 define void @generic_volatile_i8(ptr %a) {
@@ -463,241 +233,6 @@ define void @generic_volatile_double(ptr %a) {
   ret void
 }
 
-; TODO: volatile, atomic, and volatile atomic memory operations on vector types.
-; Currently, LLVM:
-; - does not allow atomic operations on vectors.
-; - it allows volatile operations but not clear what that means.
-; Following both semantics make sense in general and PTX supports both:
-; - volatile/atomic/volatile atomic applies to the whole vector
-; - volatile/atomic/volatile atomic applies elementwise
-; Actions required:
-; - clarify LLVM semantics for volatile on vectors and align the NVPTX backend with those
-; Below tests show that the current implementation picks the semantics in an inconsistent way
-; * volatile <2 x i8> lowers to "elementwise volatile"
-; * <4 x i8> lowers to "full vector volatile"
-; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics
-; - update tests in load-store-sm70.ll as well.
-
-; TODO: make this operation consistent with the one for <4 x i8>
-; This operation lowers to a "element wise volatile PTX operation".
-define void @generic_volatile_2xi8(ptr %a) {
-; CHECK-LABEL: generic_volatile_2xi8(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi8_param_0];
-; CHECK-NEXT:    ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1];
-; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.volatile.v2.u8 [%rd1], {%rs4, %rs3};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <2 x i8>, ptr %a
-  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
-  store volatile <2 x i8> %a.add, ptr %a
-  ret void
-}
-
-; TODO: make this operation consistent with the one for <2 x i8>
-; This operation lowers to a "full vector volatile PTX operation".
-define void @generic_volatile_4xi8(ptr %a) {
-; CHECK-LABEL: generic_volatile_4xi8(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b32 %r<13>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi8_param_0];
-; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
-; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
-; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
-; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
-; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.volatile.u32 [%rd1], %r12;
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <4 x i8>, ptr %a
-  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
-  store volatile <4 x i8> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_volatile_2xi16(ptr %a) {
-; CHECK-LABEL: generic_volatile_2xi16(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi16_param_0];
-; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
-; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.volatile.u32 [%rd1], %r2;
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <2 x i16>, ptr %a
-  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
-  store volatile <2 x i16> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_volatile_4xi16(ptr %a) {
-; CHECK-LABEL: generic_volatile_4xi16(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi16_param_0];
-; CHECK-NEXT:    ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
-; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
-; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
-; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <4 x i16>, ptr %a
-  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
-  store volatile <4 x i16> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_volatile_2xi32(ptr %a) {
-; CHECK-LABEL: generic_volatile_2xi32(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi32_param_0];
-; CHECK-NEXT:    ld.volatile.v2.u32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    add.s32 %r3, %r2, 1;
-; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.volatile.v2.u32 [%rd1], {%r4, %r3};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <2 x i32>, ptr %a
-  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
-  store volatile <2 x i32> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_volatile_4xi32(ptr %a) {
-; CHECK-LABEL: generic_volatile_4xi32(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi32_param_0];
-; CHECK-NEXT:    ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    add.s32 %r5, %r4, 1;
-; CHECK-NEXT:    add.s32 %r6, %r3, 1;
-; CHECK-NEXT:    add.s32 %r7, %r2, 1;
-; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <4 x i32>, ptr %a
-  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
-  store volatile <4 x i32> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_volatile_2xi64(ptr %a) {
-; CHECK-LABEL: generic_volatile_2xi64(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<6>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi64_param_0];
-; CHECK-NEXT:    ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
-; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.volatile.v2.u64 [%rd1], {%rd5, %rd4};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <2 x i64>, ptr %a
-  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
-  store volatile <2 x i64> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_volatile_2xfloat(ptr %a) {
-; CHECK-LABEL: generic_volatile_2xfloat(
-; CHECK:       {
-; CHECK-NEXT:    .reg .f32 %f<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.v2.f32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.v2.f32 [%rd1], {%f4, %f3};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <2 x float>, ptr %a
-  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
-  store volatile <2 x float> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_volatile_4xfloat(ptr %a) {
-; CHECK-LABEL: generic_volatile_4xfloat(
-; CHECK:       {
-; CHECK-NEXT:    .reg .f32 %f<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0];
-; CHECK-NEXT:    ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <4 x float>, ptr %a
-  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
-  store volatile <4 x float> %a.add, ptr %a
-  ret void
-}
-
-define void @generic_volatile_2xdouble(ptr %a) {
-; CHECK-LABEL: generic_volatile_2xdouble(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .f64 %fd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0];
-; CHECK-NEXT:    ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.volatile.v2.f64 [%rd1], {%fd4, %fd3};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <2 x double>, ptr %a
-  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
-  store volatile <2 x double> %a.add, ptr %a
-  ret void
-}
-
 ; generic_unordered_sys
 
 define void @generic_unordered_sys_i8(ptr %a) {
@@ -1387,222 +922,6 @@ define void @global_double(ptr addrspace(1) %a) {
   ret void
 }
 
-define void @global_2xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_2xi8(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi8_param_0];
-; CHECK-NEXT:    ld.global.v2.u8 {%rs1, %rs2}, [%rd1];
-; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.global.v2.u8 [%rd1], {%rs4, %rs3};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x i8>, ptr addrspace(1) %a
-  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
-  store <2 x i8> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_4xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_4xi8(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b32 %r<13>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi8_param_0];
-; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
-; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
-; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
-; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
-; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.global.u32 [%rd1], %r12;
-; CHECK-NEXT:    ret;
-  %a.load = load <4 x i8>, ptr addrspace(1) %a
-  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
-  store <4 x i8> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_2xi16(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_2xi16(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<3>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi16_param_0];
-; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
-; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
-; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
-; CHECK-NEXT:    st.global.u32 [%rd1], %r2;
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x i16>, ptr addrspace(1) %a
-  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
-  store <2 x i16> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_4xi16(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_4xi16(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi16_param_0];
-; CHECK-NEXT:    ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
-; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
-; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
-; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
-; CHECK-NEXT:    st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
-; CHECK-NEXT:    ret;
-  %a.load = load <4 x i16>, ptr addrspace(1) %a
-  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
-  store <4 x i16> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_2xi32(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_2xi32(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi32_param_0];
-; CHECK-NEXT:    ld.global.v2.u32 {%r1, %r2}, [%rd1];
-; CHECK-NEXT:    add.s32 %r3, %r2, 1;
-; CHECK-NEXT:    add.s32 %r4, %r1, 1;
-; CHECK-NEXT:    st.global.v2.u32 [%rd1], {%r4, %r3};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x i32>, ptr addrspace(1) %a
-  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
-  store <2 x i32> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_4xi32(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_4xi32(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi32_param_0];
-; CHECK-NEXT:    ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
-; CHECK-NEXT:    add.s32 %r5, %r4, 1;
-; CHECK-NEXT:    add.s32 %r6, %r3, 1;
-; CHECK-NEXT:    add.s32 %r7, %r2, 1;
-; CHECK-NEXT:    add.s32 %r8, %r1, 1;
-; CHECK-NEXT:    st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
-; CHECK-NEXT:    ret;
-  %a.load = load <4 x i32>, ptr addrspace(1) %a
-  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
-  store <4 x i32> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_2xi64(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_2xi64(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<6>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi64_param_0];
-; CHECK-NEXT:    ld.global.v2.u64 {%rd2, %rd3}, [%rd1];
-; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
-; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
-; CHECK-NEXT:    st.global.v2.u64 [%rd1], {%rd5, %rd4};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x i64>, ptr addrspace(1) %a
-  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
-  store <2 x i64> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_2xfloat(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_2xfloat(
-; CHECK:       {
-; CHECK-NEXT:    .reg .f32 %f<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xfloat_param_0];
-; CHECK-NEXT:    ld.global.v2.f32 {%f1, %f2}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
-; CHECK-NEXT:    st.global.v2.f32 [%rd1], {%f4, %f3};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x float>, ptr addrspace(1) %a
-  %a.add = fadd <2 x float> %a.load, <float 1., float 1.>
-  store <2 x float> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_4xfloat(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_4xfloat(
-; CHECK:       {
-; CHECK-NEXT:    .reg .f32 %f<9>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xfloat_param_0];
-; CHECK-NEXT:    ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
-; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
-; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
-; CHECK-NEXT:    st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
-; CHECK-NEXT:    ret;
-  %a.load = load <4 x float>, ptr addrspace(1) %a
-  %a.add = fadd <4 x float> %a.load, <float 1., float 1., float 1., float 1.>
-  store <4 x float> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_2xdouble(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_2xdouble(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-NEXT:    .reg .f64 %fd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xdouble_param_0];
-; CHECK-NEXT:    ld.global.v2.f64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT:    st.global.v2.f64 [%rd1], {%fd4, %fd3};
-; CHECK-NEXT:    ret;
-  %a.load = load <2 x double>, ptr addrspace(1) %a
-  %a.add = fadd <2 x double> %a.load, <double 1., double 1.>
-  store <2 x double> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
 ; global_volatile
 
 define void @global_volatile_i8(ptr addrspace(1) %a) {
@@ -1712,222 +1031,6 @@ define void @global_volatile_double(ptr addrspace(1) %a) {
   ret void
 }
 
-define void @global_volatile_2xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_2xi8(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi8_param_0];
-; CHECK-NEXT:    ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1];
-; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
-; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
-; CHECK-NEXT:    st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3};
-; CHECK-NEXT:    ret;
-  %a.load = load volatile <2 x i8>, ptr addrspace(1) %a
-  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
-  store volatile <2 x i8> %a.add, ptr addrspace(1) %a
-  ret void
-}
-
-define void @global_volatile_4xi8(ptr addrspace(1) %a) {
-; CHECK-LABEL: global_volatile_4xi8(
-; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<9>;
-; CHECK-NEXT:    .reg .b32 %r<13>;
-; CHECK-NEXT:    .reg .b64 %rd<2>;
-; CHECK-EMPTY:
-; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi8_param_0];
-; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
-; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
-; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
-; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
-; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
-; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
-; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
-; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
-; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
-; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
-; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
-; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
-; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
-; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
-; CHECK-NEXT:    st.volatile.global.u32 [%rd1], %r12;
-; CHECK-NEXT:    ret;
-  %a.load
= load volatile <4 x i8>, ptr addrspace(1) %a - %a.add = add <4 x i8> %a.load, - store volatile <4 x i8> %a.add, ptr addrspace(1) %a - ret void -} - -define void @global_volatile_2xi16(ptr addrspace(1) %a) { -; CHECK-LABEL: global_volatile_2xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi16_param_0]; -; CHECK-NEXT: ld.volatile.global.u32 %r1, [%rd1]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.volatile.global.u32 [%rd1], %r2; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i16>, ptr addrspace(1) %a - %a.add = add <2 x i16> %a.load, - store volatile <2 x i16> %a.add, ptr addrspace(1) %a - ret void -} - -define void @global_volatile_4xi16(ptr addrspace(1) %a) { -; CHECK-LABEL: global_volatile_4xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi16_param_0]; -; CHECK-NEXT: ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; CHECK-NEXT: add.s16 %rs5, %rs4, 1; -; CHECK-NEXT: add.s16 %rs6, %rs3, 1; -; CHECK-NEXT: add.s16 %rs7, %rs2, 1; -; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x i16>, ptr addrspace(1) %a - %a.add = add <4 x i16> %a.load, - store volatile <4 x i16> %a.add, ptr addrspace(1) %a - ret void -} - -define void @global_volatile_2xi32(ptr addrspace(1) %a) { -; CHECK-LABEL: global_volatile_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi32_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1]; -; CHECK-NEXT: add.s32 %r3, %r2, 1; -; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.volatile.global.v2.u32 [%rd1], {%r4, %r3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i32>, ptr addrspace(1) %a - %a.add = add <2 x i32> %a.load, - store volatile <2 x i32> %a.add, ptr addrspace(1) %a - ret void -} - -define void @global_volatile_4xi32(ptr addrspace(1) %a) { -; CHECK-LABEL: global_volatile_4xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xi32_param_0]; -; CHECK-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: add.s32 %r5, %r4, 1; -; CHECK-NEXT: add.s32 %r6, %r3, 1; -; CHECK-NEXT: add.s32 %r7, %r2, 1; -; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x i32>, ptr addrspace(1) %a - %a.add = add <4 x i32> %a.load, - store volatile <4 x i32> %a.add, ptr addrspace(1) %a - ret void -} - -define void @global_volatile_2xi64(ptr addrspace(1) %a) { -; CHECK-LABEL: global_volatile_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xi64_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: add.s64 %rd4, %rd3, 1; -; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4}; -; CHECK-NEXT: 
ret; - %a.load = load volatile <2 x i64>, ptr addrspace(1) %a - %a.add = add <2 x i64> %a.load, - store volatile <2 x i64> %a.add, ptr addrspace(1) %a - ret void -} - -define void @global_volatile_2xfloat(ptr addrspace(1) %a) { -; CHECK-LABEL: global_volatile_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xfloat_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.global.v2.f32 [%rd1], {%f4, %f3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x float>, ptr addrspace(1) %a - %a.add = fadd <2 x float> %a.load, - store volatile <2 x float> %a.add, ptr addrspace(1) %a - ret void -} - -define void @global_volatile_4xfloat(ptr addrspace(1) %a) { -; CHECK-LABEL: global_volatile_4xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_4xfloat_param_0]; -; CHECK-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x float>, ptr addrspace(1) %a - %a.add = fadd <4 x float> %a.load, - store volatile <4 x float> %a.add, ptr addrspace(1) %a - ret void -} - -define void @global_volatile_2xdouble(ptr addrspace(1) %a) { -; CHECK-LABEL: global_volatile_2xdouble( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-NEXT: .reg .f64 %fd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [global_volatile_2xdouble_param_0]; -; CHECK-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; -; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x double>, ptr addrspace(1) %a - %a.add = fadd <2 x double> %a.load, - store volatile <2 x double> %a.add, ptr addrspace(1) %a - ret void -} - ; global_unordered_sys define void @global_unordered_sys_i8(ptr addrspace(1) %a) { @@ -2759,222 +1862,6 @@ define void @shared_double(ptr addrspace(3) %a) { ret void } -define void @shared_2xi8(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_2xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi8_param_0]; -; CHECK-NEXT: ld.shared.v2.u8 {%rs1, %rs2}, [%rd1]; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.shared.v2.u8 [%rd1], {%rs4, %rs3}; -; CHECK-NEXT: ret; - %a.load = load <2 x i8>, ptr addrspace(3) %a - %a.add = add <2 x i8> %a.load, - store <2 x i8> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_4xi8(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_4xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi8_param_0]; -; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; -; CHECK-NEXT: 
cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; -; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 1; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.shared.u32 [%rd1], %r12; -; CHECK-NEXT: ret; - %a.load = load <4 x i8>, ptr addrspace(3) %a - %a.add = add <4 x i8> %a.load, - store <4 x i8> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_2xi16(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_2xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi16_param_0]; -; CHECK-NEXT: ld.shared.u32 %r1, [%rd1]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.shared.u32 [%rd1], %r2; -; CHECK-NEXT: ret; - %a.load = load <2 x i16>, ptr addrspace(3) %a - %a.add = add <2 x i16> %a.load, - store <2 x i16> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_4xi16(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_4xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi16_param_0]; -; CHECK-NEXT: ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; CHECK-NEXT: add.s16 %rs5, %rs4, 1; -; CHECK-NEXT: add.s16 %rs6, %rs3, 1; -; CHECK-NEXT: add.s16 %rs7, %rs2, 1; -; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; -; CHECK-NEXT: ret; - %a.load = load <4 x i16>, ptr addrspace(3) %a - %a.add = add <4 x i16> %a.load, - store <4 x i16> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_2xi32(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi32_param_0]; -; CHECK-NEXT: ld.shared.v2.u32 {%r1, %r2}, [%rd1]; -; CHECK-NEXT: add.s32 %r3, %r2, 1; -; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.shared.v2.u32 [%rd1], {%r4, %r3}; -; CHECK-NEXT: ret; - %a.load = load <2 x i32>, ptr addrspace(3) %a - %a.add = add <2 x i32> %a.load, - store <2 x i32> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_4xi32(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_4xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi32_param_0]; -; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: add.s32 %r5, %r4, 1; -; CHECK-NEXT: add.s32 %r6, %r3, 1; -; CHECK-NEXT: add.s32 %r7, %r2, 1; -; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; -; CHECK-NEXT: ret; - %a.load = load <4 x i32>, ptr addrspace(3) %a - %a.add = add <4 x i32> %a.load, - store <4 x i32> 
%a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_2xi64(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xi64_param_0]; -; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: add.s64 %rd4, %rd3, 1; -; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.shared.v2.u64 [%rd1], {%rd5, %rd4}; -; CHECK-NEXT: ret; - %a.load = load <2 x i64>, ptr addrspace(3) %a - %a.add = add <2 x i64> %a.load, - store <2 x i64> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_2xfloat(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xfloat_param_0]; -; CHECK-NEXT: ld.shared.v2.f32 {%f1, %f2}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.shared.v2.f32 [%rd1], {%f4, %f3}; -; CHECK-NEXT: ret; - %a.load = load <2 x float>, ptr addrspace(3) %a - %a.add = fadd <2 x float> %a.load, - store <2 x float> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_4xfloat(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_4xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xfloat_param_0]; -; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; -; CHECK-NEXT: ret; - %a.load = load <4 x float>, ptr addrspace(3) %a - %a.add = fadd <4 x float> %a.load, - store <4 x float> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_2xdouble(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_2xdouble( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-NEXT: .reg .f64 %fd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_2xdouble_param_0]; -; CHECK-NEXT: ld.shared.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; -; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.shared.v2.f64 [%rd1], {%fd4, %fd3}; -; CHECK-NEXT: ret; - %a.load = load <2 x double>, ptr addrspace(3) %a - %a.add = fadd <2 x double> %a.load, - store <2 x double> %a.add, ptr addrspace(3) %a - ret void -} - ; shared_volatile define void @shared_volatile_i8(ptr addrspace(3) %a) { @@ -3084,222 +1971,6 @@ define void @shared_volatile_double(ptr addrspace(3) %a) { ret void } -define void @shared_volatile_2xi8(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_volatile_2xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi8_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1]; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i8>, ptr addrspace(3) %a - %a.add = add <2 x i8> %a.load, - store volatile <2 x i8> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_4xi8(ptr addrspace(3) %a) { -; 
CHECK-LABEL: shared_volatile_4xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi8_param_0]; -; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; -; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 1; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r12; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x i8>, ptr addrspace(3) %a - %a.add = add <4 x i8> %a.load, - store volatile <4 x i8> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_2xi16(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_volatile_2xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0]; -; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i16>, ptr addrspace(3) %a - %a.add = add <2 x i16> %a.load, - store volatile <2 x i16> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_4xi16(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_volatile_4xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; CHECK-NEXT: add.s16 %rs5, %rs4, 1; -; CHECK-NEXT: add.s16 %rs6, %rs3, 1; -; CHECK-NEXT: add.s16 %rs7, %rs2, 1; -; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x i16>, ptr addrspace(3) %a - %a.add = add <4 x i16> %a.load, - store volatile <4 x i16> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_2xi32(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_volatile_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1]; -; CHECK-NEXT: add.s32 %r3, %r2, 1; -; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i32>, ptr addrspace(3) %a - %a.add = add <2 x i32> %a.load, - store volatile <2 x i32> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_4xi32(ptr addrspace(3) %a) { -; 
CHECK-LABEL: shared_volatile_4xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: add.s32 %r5, %r4, 1; -; CHECK-NEXT: add.s32 %r6, %r3, 1; -; CHECK-NEXT: add.s32 %r7, %r2, 1; -; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x i32>, ptr addrspace(3) %a - %a.add = add <4 x i32> %a.load, - store volatile <4 x i32> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_2xi64(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_volatile_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: add.s64 %rd4, %rd3, 1; -; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i64>, ptr addrspace(3) %a - %a.add = add <2 x i64> %a.load, - store volatile <2 x i64> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_2xfloat(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_volatile_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x float>, ptr addrspace(3) %a - %a.add = fadd <2 x float> %a.load, - store volatile <2 x float> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_4xfloat(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_volatile_4xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x float>, ptr addrspace(3) %a - %a.add = fadd <4 x float> %a.load, - store volatile <4 x float> %a.add, ptr addrspace(3) %a - ret void -} - -define void @shared_volatile_2xdouble(ptr addrspace(3) %a) { -; CHECK-LABEL: shared_volatile_2xdouble( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-NEXT: .reg .f64 %fd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; -; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x double>, ptr addrspace(3) %a - %a.add = fadd <2 x double> %a.load, - store volatile <2 x double> %a.add, ptr addrspace(3) %a - ret void 
-} - ; shared_unordered_sys define void @shared_unordered_sys_i8(ptr addrspace(3) %a) { @@ -3989,222 +2660,6 @@ define void @local_double(ptr addrspace(5) %a) { ret void } -define void @local_2xi8(ptr addrspace(5) %a) { -; CHECK-LABEL: local_2xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi8_param_0]; -; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; -; CHECK-NEXT: ret; - %a.load = load <2 x i8>, ptr addrspace(5) %a - %a.add = add <2 x i8> %a.load, - store <2 x i8> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_4xi8(ptr addrspace(5) %a) { -; CHECK-LABEL: local_4xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; -; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 1; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.local.u32 [%rd1], %r12; -; CHECK-NEXT: ret; - %a.load = load <4 x i8>, ptr addrspace(5) %a - %a.add = add <4 x i8> %a.load, - store <4 x i8> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_2xi16(ptr addrspace(5) %a) { -; CHECK-LABEL: local_2xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi16_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; -; CHECK-NEXT: ret; - %a.load = load <2 x i16>, ptr addrspace(5) %a - %a.add = add <2 x i16> %a.load, - store <2 x i16> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_4xi16(ptr addrspace(5) %a) { -; CHECK-LABEL: local_4xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi16_param_0]; -; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; CHECK-NEXT: add.s16 %rs5, %rs4, 1; -; CHECK-NEXT: add.s16 %rs6, %rs3, 1; -; CHECK-NEXT: add.s16 %rs7, %rs2, 1; -; CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; -; CHECK-NEXT: ret; - %a.load = load <4 x i16>, ptr addrspace(5) %a - %a.add = add <4 x i16> %a.load, - store <4 x i16> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_2xi32(ptr addrspace(5) %a) { -; CHECK-LABEL: local_2xi32( -; CHECK: { -; CHECK-NEXT: 
.reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi32_param_0]; -; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; -; CHECK-NEXT: add.s32 %r3, %r2, 1; -; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; -; CHECK-NEXT: ret; - %a.load = load <2 x i32>, ptr addrspace(5) %a - %a.add = add <2 x i32> %a.load, - store <2 x i32> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_4xi32(ptr addrspace(5) %a) { -; CHECK-LABEL: local_4xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi32_param_0]; -; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: add.s32 %r5, %r4, 1; -; CHECK-NEXT: add.s32 %r6, %r3, 1; -; CHECK-NEXT: add.s32 %r7, %r2, 1; -; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; -; CHECK-NEXT: ret; - %a.load = load <4 x i32>, ptr addrspace(5) %a - %a.add = add <4 x i32> %a.load, - store <4 x i32> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_2xi64(ptr addrspace(5) %a) { -; CHECK-LABEL: local_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0]; -; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: add.s64 %rd4, %rd3, 1; -; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; -; CHECK-NEXT: ret; - %a.load = load <2 x i64>, ptr addrspace(5) %a - %a.add = add <2 x i64> %a.load, - store <2 x i64> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_2xfloat(ptr addrspace(5) %a) { -; CHECK-LABEL: local_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0]; -; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; -; CHECK-NEXT: ret; - %a.load = load <2 x float>, ptr addrspace(5) %a - %a.add = fadd <2 x float> %a.load, - store <2 x float> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_4xfloat(ptr addrspace(5) %a) { -; CHECK-LABEL: local_4xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; -; CHECK-NEXT: ret; - %a.load = load <4 x float>, ptr addrspace(5) %a - %a.add = fadd <4 x float> %a.load, - store <4 x float> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_2xdouble(ptr addrspace(5) %a) { -; CHECK-LABEL: local_2xdouble( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-NEXT: .reg .f64 %fd<5>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0]; -; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000; -; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000; -; CHECK-NEXT: 
st.local.v2.f64 [%rd1], {%fd4, %fd3}; -; CHECK-NEXT: ret; - %a.load = load <2 x double>, ptr addrspace(5) %a - %a.add = fadd <2 x double> %a.load, - store <2 x double> %a.add, ptr addrspace(5) %a - ret void -} - ; local_volatile define void @local_volatile_i8(ptr addrspace(5) %a) { @@ -4314,222 +2769,6 @@ define void @local_volatile_double(ptr addrspace(5) %a) { ret void } -define void @local_volatile_2xi8(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_2xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi8_param_0]; -; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1]; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i8>, ptr addrspace(5) %a - %a.add = add <2 x i8> %a.load, - store volatile <2 x i8> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_volatile_4xi8(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_4xi8( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b32 %r<13>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; -; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; -; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; -; CHECK-NEXT: add.s16 %rs2, %rs1, 1; -; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; -; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; -; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; -; CHECK-NEXT: add.s16 %rs4, %rs3, 1; -; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; -; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; -; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; -; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; -; CHECK-NEXT: add.s16 %rs6, %rs5, 1; -; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; -; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; -; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; -; CHECK-NEXT: add.s16 %rs8, %rs7, 1; -; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; -; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; -; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; -; CHECK-NEXT: st.local.u32 [%rd1], %r12; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x i8>, ptr addrspace(5) %a - %a.add = add <4 x i8> %a.load, - store volatile <4 x i8> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_volatile_2xi16(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_2xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<5>; -; CHECK-NEXT: .reg .b32 %r<3>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi16_param_0]; -; CHECK-NEXT: ld.local.u32 %r1, [%rd1]; -; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; -; CHECK-NEXT: add.s16 %rs3, %rs2, 1; -; CHECK-NEXT: add.s16 %rs4, %rs1, 1; -; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; -; CHECK-NEXT: st.local.u32 [%rd1], %r2; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i16>, ptr addrspace(5) %a - %a.add = add <2 x i16> %a.load, - store volatile <2 x i16> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_volatile_4xi16(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_4xi16( -; CHECK: { -; CHECK-NEXT: .reg .b16 %rs<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi16_param_0]; -; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; -; CHECK-NEXT: add.s16 %rs5, %rs4, 1; -; CHECK-NEXT: add.s16 %rs6, %rs3, 1; -; CHECK-NEXT: add.s16 %rs7, %rs2, 1; -; 
CHECK-NEXT: add.s16 %rs8, %rs1, 1; -; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x i16>, ptr addrspace(5) %a - %a.add = add <4 x i16> %a.load, - store volatile <4 x i16> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_volatile_2xi32(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_2xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi32_param_0]; -; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1]; -; CHECK-NEXT: add.s32 %r3, %r2, 1; -; CHECK-NEXT: add.s32 %r4, %r1, 1; -; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i32>, ptr addrspace(5) %a - %a.add = add <2 x i32> %a.load, - store volatile <2 x i32> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_volatile_4xi32(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_4xi32( -; CHECK: { -; CHECK-NEXT: .reg .b32 %r<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi32_param_0]; -; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: add.s32 %r5, %r4, 1; -; CHECK-NEXT: add.s32 %r6, %r3, 1; -; CHECK-NEXT: add.s32 %r7, %r2, 1; -; CHECK-NEXT: add.s32 %r8, %r1, 1; -; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; -; CHECK-NEXT: ret; - %a.load = load volatile <4 x i32>, ptr addrspace(5) %a - %a.add = add <4 x i32> %a.load, - store volatile <4 x i32> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_volatile_2xi64(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_2xi64( -; CHECK: { -; CHECK-NEXT: .reg .b64 %rd<6>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi64_param_0]; -; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: add.s64 %rd4, %rd3, 1; -; CHECK-NEXT: add.s64 %rd5, %rd2, 1; -; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x i64>, ptr addrspace(5) %a - %a.add = add <2 x i64> %a.load, - store volatile <2 x i64> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_volatile_2xfloat(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_2xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<5>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xfloat_param_0]; -; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000; -; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3}; -; CHECK-NEXT: ret; - %a.load = load volatile <2 x float>, ptr addrspace(5) %a - %a.add = fadd <2 x float> %a.load, - store volatile <2 x float> %a.add, ptr addrspace(5) %a - ret void -} - -define void @local_volatile_4xfloat(ptr addrspace(5) %a) { -; CHECK-LABEL: local_volatile_4xfloat( -; CHECK: { -; CHECK-NEXT: .reg .f32 %f<9>; -; CHECK-NEXT: .reg .b64 %rd<2>; -; CHECK-EMPTY: -; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000; -; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000; -; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5}; -; CHECK-NEXT: ret; - 
%a.load = load volatile <4 x float>, ptr addrspace(5) %a
- %a.add = fadd <4 x float> %a.load, <float 1.0, float 1.0, float 1.0, float 1.0>
- store volatile <4 x float> %a.add, ptr addrspace(5) %a
- ret void
-}
-
-define void @local_volatile_2xdouble(ptr addrspace(5) %a) {
-; CHECK-LABEL: local_volatile_2xdouble(
-; CHECK: {
-; CHECK-NEXT: .reg .b64 %rd<2>;
-; CHECK-NEXT: .reg .f64 %fd<5>;
-; CHECK-EMPTY:
-; CHECK-NEXT: // %bb.0:
-; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xdouble_param_0];
-; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
-; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
-; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
-; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3};
-; CHECK-NEXT: ret;
- %a.load = load volatile <2 x double>, ptr addrspace(5) %a
- %a.add = fadd <2 x double> %a.load, <double 1.0, double 1.0>
- store volatile <2 x double> %a.add, ptr addrspace(5) %a
- ret void
-}
-
 ; local_unordered_sys
 
 define void @local_unordered_sys_i8(ptr addrspace(5) %a) {
diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
new file mode 100644
index 0000000000000..ba397dca68f1b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/load-store-vectors.ll
@@ -0,0 +1,3267 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_20 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
+
+; TODO: add i1 and <6 x i8> vector tests.
+
+; TODO: add tests for vectors that exceed 128 bits in length.
+; Per https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#vectors
+; vectors cannot exceed 128 bits in length, i.e., .v4.u64 is not allowed.
+
+; TODO: generate PTX that preserves Concurrent Forward Progress
+; for atomic operations to the local statespace
+; by generating atomic or volatile operations.
+
+; TODO: design exposure for atomic operations on vector types.
+
+; TODO: add weak, atomic, volatile, and atomic volatile tests
+; for the .const and .param statespaces.
+
+;; generic statespace
+
+; generic
+
+; TODO: make the lowering of these weak vector ops consistent with
+; the one in the next test. This test lowers to a weak PTX
+; vector op, but the next test lowers to a weak PTX scalar op.
+define void @generic_2xi8(ptr %a) {
+; CHECK-LABEL: generic_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi8_param_0];
+; CHECK-NEXT: ld.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i8>, ptr %a
+ %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+ store <2 x i8> %a.add, ptr %a
+ ret void
+}
+
+; TODO: make the lowering of these weak vector ops consistent with
+; the one in the previous test. This test lowers to a weak
+; PTX scalar op, but the prior test lowers to a weak PTX vector op.
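+; For illustration only (a hand-written sketch, not output checked by this
+; test): PTX also has a .v4 form for 8-bit types, so a lowering of the
+; <4 x i8> case that stays consistent with the <2 x i8> case above could
+; keep the element type, e.g.
+;   ld.v4.u8 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+;   st.v4.u8 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; instead of the packed ld.u32/st.u32 lowering checked below.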
+define void @generic_4xi8(ptr %a) { +; CHECK-LABEL: generic_4xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b32 %r<13>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi8_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r2; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r4; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r5, %rs4; +; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U; +; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r7; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs6; +; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r9; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r10, %rs8; +; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U; +; CHECK-NEXT: st.u32 [%rd1], %r12; +; CHECK-NEXT: ret; + %a.load = load <4 x i8>, ptr %a + %a.add = add <4 x i8> %a.load, + store <4 x i8> %a.add, ptr %a + ret void +} + +define void @generic_8xi8(ptr %a) { +; CHECK-LABEL: generic_8xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<17>; +; CHECK-NEXT: .reg .b32 %r<25>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi8_param_0]; +; CHECK-NEXT: ld.v2.b32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r3; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r4, %rs2; +; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r5; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs4; +; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U; +; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r8; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r9, %rs6; +; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r10; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs8; +; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U; +; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U; +; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs9, %r14; +; CHECK-NEXT: add.s16 %rs10, %rs9, 1; +; CHECK-NEXT: cvt.u32.u16 %r15, %rs10; +; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r16; +; CHECK-NEXT: add.s16 %rs12, %rs11, 1; +; CHECK-NEXT: cvt.u32.u16 %r17, %rs12; +; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U; +; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs13, %r19; +; CHECK-NEXT: add.s16 %rs14, %rs13, 1; +; CHECK-NEXT: cvt.u32.u16 %r20, %rs14; +; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs15, %r21; +; CHECK-NEXT: add.s16 %rs16, %rs15, 1; +; CHECK-NEXT: cvt.u32.u16 %r22, %rs16; +; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U; +; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U; +; CHECK-NEXT: st.v2.b32 [%rd1], {%r24, %r13}; +; CHECK-NEXT: ret; + %a.load = load <8 x i8>, ptr %a + %a.add = add <8 x i8> %a.load, + store <8 x i8> %a.add, ptr %a + ret void +} + +define void @generic_16xi8(ptr %a) { +; CHECK-LABEL: generic_16xi8( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<33>; +; CHECK-NEXT: .reg .b32 %r<49>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: 
ld.param.u64 %rd1, [generic_16xi8_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs1, %r5; +; CHECK-NEXT: add.s16 %rs2, %rs1, 1; +; CHECK-NEXT: cvt.u32.u16 %r6, %rs2; +; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs3, %r7; +; CHECK-NEXT: add.s16 %rs4, %rs3, 1; +; CHECK-NEXT: cvt.u32.u16 %r8, %rs4; +; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U; +; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs5, %r10; +; CHECK-NEXT: add.s16 %rs6, %rs5, 1; +; CHECK-NEXT: cvt.u32.u16 %r11, %rs6; +; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs7, %r12; +; CHECK-NEXT: add.s16 %rs8, %rs7, 1; +; CHECK-NEXT: cvt.u32.u16 %r13, %rs8; +; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U; +; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U; +; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs9, %r16; +; CHECK-NEXT: add.s16 %rs10, %rs9, 1; +; CHECK-NEXT: cvt.u32.u16 %r17, %rs10; +; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs11, %r18; +; CHECK-NEXT: add.s16 %rs12, %rs11, 1; +; CHECK-NEXT: cvt.u32.u16 %r19, %rs12; +; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U; +; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs13, %r21; +; CHECK-NEXT: add.s16 %rs14, %rs13, 1; +; CHECK-NEXT: cvt.u32.u16 %r22, %rs14; +; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs15, %r23; +; CHECK-NEXT: add.s16 %rs16, %rs15, 1; +; CHECK-NEXT: cvt.u32.u16 %r24, %rs16; +; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U; +; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U; +; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs17, %r27; +; CHECK-NEXT: add.s16 %rs18, %rs17, 1; +; CHECK-NEXT: cvt.u32.u16 %r28, %rs18; +; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs19, %r29; +; CHECK-NEXT: add.s16 %rs20, %rs19, 1; +; CHECK-NEXT: cvt.u32.u16 %r30, %rs20; +; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U; +; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs21, %r32; +; CHECK-NEXT: add.s16 %rs22, %rs21, 1; +; CHECK-NEXT: cvt.u32.u16 %r33, %rs22; +; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs23, %r34; +; CHECK-NEXT: add.s16 %rs24, %rs23, 1; +; CHECK-NEXT: cvt.u32.u16 %r35, %rs24; +; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U; +; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U; +; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8; +; CHECK-NEXT: cvt.u16.u32 %rs25, %r38; +; CHECK-NEXT: add.s16 %rs26, %rs25, 1; +; CHECK-NEXT: cvt.u32.u16 %r39, %rs26; +; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8; +; CHECK-NEXT: cvt.u16.u32 %rs27, %r40; +; CHECK-NEXT: add.s16 %rs28, %rs27, 1; +; CHECK-NEXT: cvt.u32.u16 %r41, %rs28; +; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U; +; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8; +; CHECK-NEXT: cvt.u16.u32 %rs29, %r43; +; CHECK-NEXT: add.s16 %rs30, %rs29, 1; +; CHECK-NEXT: cvt.u32.u16 %r44, %rs30; +; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8; +; CHECK-NEXT: cvt.u16.u32 %rs31, %r45; +; CHECK-NEXT: add.s16 %rs32, %rs31, 1; +; CHECK-NEXT: cvt.u32.u16 %r46, %rs32; +; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U; +; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r48, %r37, %r26, %r15}; +; CHECK-NEXT: ret; + %a.load = load <16 x i8>, ptr %a + %a.add = add <16 x i8> %a.load, + store <16 x i8> %a.add, ptr %a + ret void +} + +define void @generic_2xi16(ptr %a) { +; CHECK-LABEL: generic_2xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 
%rs<5>; +; CHECK-NEXT: .reg .b32 %r<3>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi16_param_0]; +; CHECK-NEXT: ld.u32 %r1, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3}; +; CHECK-NEXT: st.u32 [%rd1], %r2; +; CHECK-NEXT: ret; + %a.load = load <2 x i16>, ptr %a + %a.add = add <2 x i16> %a.load, + store <2 x i16> %a.add, ptr %a + ret void +} + +define void @generic_4xi16(ptr %a) { +; CHECK-LABEL: generic_4xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi16_param_0]; +; CHECK-NEXT: ld.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1]; +; CHECK-NEXT: add.s16 %rs5, %rs4, 1; +; CHECK-NEXT: add.s16 %rs6, %rs3, 1; +; CHECK-NEXT: add.s16 %rs7, %rs2, 1; +; CHECK-NEXT: add.s16 %rs8, %rs1, 1; +; CHECK-NEXT: st.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i16>, ptr %a + %a.add = add <4 x i16> %a.load, + store <4 x i16> %a.add, ptr %a + ret void +} + +define void @generic_8xi16(ptr %a) { +; CHECK-LABEL: generic_8xi16( +; CHECK: { +; CHECK-NEXT: .reg .b16 %rs<17>; +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi16_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4; +; CHECK-NEXT: add.s16 %rs3, %rs2, 1; +; CHECK-NEXT: add.s16 %rs4, %rs1, 1; +; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3}; +; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3; +; CHECK-NEXT: add.s16 %rs7, %rs6, 1; +; CHECK-NEXT: add.s16 %rs8, %rs5, 1; +; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7}; +; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2; +; CHECK-NEXT: add.s16 %rs11, %rs10, 1; +; CHECK-NEXT: add.s16 %rs12, %rs9, 1; +; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11}; +; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1; +; CHECK-NEXT: add.s16 %rs15, %rs14, 1; +; CHECK-NEXT: add.s16 %rs16, %rs13, 1; +; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15}; +; CHECK-NEXT: st.v4.b32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <8 x i16>, ptr %a + %a.add = add <8 x i16> %a.load, + store <8 x i16> %a.add, ptr %a + ret void +} + +define void @generic_2xi32(ptr %a) { +; CHECK-LABEL: generic_2xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<5>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi32_param_0]; +; CHECK-NEXT: ld.v2.u32 {%r1, %r2}, [%rd1]; +; CHECK-NEXT: add.s32 %r3, %r2, 1; +; CHECK-NEXT: add.s32 %r4, %r1, 1; +; CHECK-NEXT: st.v2.u32 [%rd1], {%r4, %r3}; +; CHECK-NEXT: ret; + %a.load = load <2 x i32>, ptr %a + %a.add = add <2 x i32> %a.load, + store <2 x i32> %a.add, ptr %a + ret void +} + +define void @generic_4xi32(ptr %a) { +; CHECK-LABEL: generic_4xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<2>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi32_param_0]; +; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: add.s32 %r5, %r4, 1; +; CHECK-NEXT: add.s32 %r6, %r3, 1; +; CHECK-NEXT: add.s32 %r7, %r2, 1; +; CHECK-NEXT: add.s32 %r8, %r1, 1; +; CHECK-NEXT: st.v4.u32 [%rd1], {%r8, %r7, %r6, %r5}; +; CHECK-NEXT: ret; + %a.load = load <4 x i32>, ptr %a + %a.add = add <4 x i32> %a.load, + store <4 x i32> %a.add, ptr %a + ret 
void
+}
+
+define void @generic_2xi64(ptr %a) {
+; CHECK-LABEL: generic_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xi64_param_0];
+; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x i64>, ptr %a
+ %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+ store <2 x i64> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_2xfloat(ptr %a) {
+; CHECK-LABEL: generic_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xfloat_param_0];
+; CHECK-NEXT: ld.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x float>, ptr %a
+ %a.add = fadd <2 x float> %a.load, <float 1.0, float 1.0>
+ store <2 x float> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_4xfloat(ptr %a) {
+; CHECK-LABEL: generic_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xfloat_param_0];
+; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+ %a.load = load <4 x float>, ptr %a
+ %a.add = fadd <4 x float> %a.load, <float 1.0, float 1.0, float 1.0, float 1.0>
+ store <4 x float> %a.add, ptr %a
+ ret void
+}
+
+define void @generic_2xdouble(ptr %a) {
+; CHECK-LABEL: generic_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [generic_2xdouble_param_0];
+; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+ %a.load = load <2 x double>, ptr %a
+ %a.add = fadd <2 x double> %a.load, <double 1.0, double 1.0>
+ store <2 x double> %a.add, ptr %a
+ ret void
+}
+
+; generic_volatile
+
+; TODO: volatile, atomic, and volatile atomic memory operations on vector types.
+; Currently, LLVM:
+; - does not allow atomic operations on vectors.
+; - allows volatile operations, but it is not clear what volatile means for them.
+; Both of the following semantics make sense in general, and PTX supports both:
+; - volatile/atomic/volatile atomic applies to the whole vector
+; - volatile/atomic/volatile atomic applies elementwise
+; Actions required:
+; - clarify the LLVM semantics of volatile on vectors and align the NVPTX backend
+;   with them. The tests below show that the current implementation picks between
+;   these semantics inconsistently:
+;   * volatile <2 x i8> lowers to "elementwise volatile"
+;   * <4 x i8> lowers to "full vector volatile"
+; - provide support for vector atomics, e.g., by extending LLVM IR or via intrinsics
+; - update tests in load-store-sm70.ll as well.
+
+; TODO: make this operation consistent with the one for <4 x i8>.
+; This operation lowers to an "elementwise volatile" PTX operation.
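+; Concretely, summarizing the CHECK lines of the two tests below, the same
+; volatile IR semantics currently yield two different PTX shapes:
+;   volatile <2 x i8>: ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1];  (elementwise volatile)
+;   volatile <4 x i8>: ld.volatile.u32 %r1, [%rd1];             (full vector volatile)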
+define void @generic_volatile_2xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi8_param_0];
+; CHECK-NEXT:    ld.volatile.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    st.volatile.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i8>, ptr %a
+  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+  store volatile <2 x i8> %a.add, ptr %a
+  ret void
+}
+
+; TODO: make this operation consistent with the one for <2 x i8>
+; This operation lowers to a "full vector volatile" PTX operation.
+define void @generic_volatile_4xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_4xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi8_param_0];
+; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
+; CHECK-NEXT:    st.volatile.u32 [%rd1], %r12;
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x i8>, ptr %a
+  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+  store volatile <4 x i8> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_8xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_8xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NEXT:    .reg .b32 %r<25>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_8xi8_param_0];
+; CHECK-NEXT:    ld.volatile.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT:    st.volatile.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <8 x i8>, ptr %a
+  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store volatile <8 x i8> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_16xi8(ptr %a) {
+; CHECK-LABEL: generic_volatile_16xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<33>;
+; CHECK-NEXT:    .reg .b32 %r<49>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_16xi8_param_0];
+; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <16 x i8>, ptr %a
+  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store volatile <16 x i8> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_2xi16(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi16_param_0];
+; CHECK-NEXT:    ld.volatile.u32 %r1, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.volatile.u32 [%rd1], %r2;
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i16>, ptr %a
+  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+  store volatile <2 x i16> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_4xi16(ptr %a) {
+; CHECK-LABEL: generic_volatile_4xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi16_param_0];
+; CHECK-NEXT:    ld.volatile.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT:    st.volatile.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x i16>, ptr %a
+  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+  store volatile <4 x i16> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_8xi16(ptr %a) {
+; CHECK-LABEL: generic_volatile_8xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_8xi16_param_0];
+; CHECK-NEXT:    ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT:    st.volatile.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <8 x i16>, ptr %a
+  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store volatile <8 x i16> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_2xi32(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi32_param_0];
+; CHECK-NEXT:    ld.volatile.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.s32 %r3, %r2, 1;
+; CHECK-NEXT:    add.s32 %r4, %r1, 1;
+; CHECK-NEXT:    st.volatile.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i32>, ptr %a
+  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+  store volatile <2 x i32> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_4xi32(ptr %a) {
+; CHECK-LABEL: generic_volatile_4xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xi32_param_0];
+; CHECK-NEXT:    ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.s32 %r5, %r4, 1;
+; CHECK-NEXT:    add.s32 %r6, %r3, 1;
+; CHECK-NEXT:    add.s32 %r7, %r2, 1;
+; CHECK-NEXT:    add.s32 %r8, %r1, 1;
+; CHECK-NEXT:    st.volatile.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x i32>, ptr %a
+  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+  store volatile <4 x i32> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_2xi64(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xi64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xi64_param_0];
+; CHECK-NEXT:    ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT:    st.volatile.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i64>, ptr %a
+  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+  store volatile <2 x i64> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_2xfloat(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x float>, ptr %a
+  %a.add = fadd <2 x float> %a.load, <float 1.0, float 1.0>
+  store volatile <2 x float> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_4xfloat(ptr %a) {
+; CHECK-LABEL: generic_volatile_4xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_4xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x float>, ptr %a
+  %a.add = fadd <4 x float> %a.load, <float 1.0, float 1.0, float 1.0, float 1.0>
+  store volatile <4 x float> %a.add, ptr %a
+  ret void
+}
+
+define void @generic_volatile_2xdouble(ptr %a) {
+; CHECK-LABEL: generic_volatile_2xdouble(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [generic_volatile_2xdouble_param_0];
+; CHECK-NEXT:    ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x double>, ptr %a
+  %a.add = fadd <2 x double> %a.load, <double 1.0, double 1.0>
+  store volatile <2 x double> %a.add, ptr %a
+  ret void
+}
+
+;; global statespace
+
+; global
+
+define void @global_2xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi8_param_0];
+; CHECK-NEXT:    ld.global.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    st.global.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x i8>, ptr addrspace(1) %a
+  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+  store <2 x i8> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_4xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi8_param_0];
+; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
+; CHECK-NEXT:    st.global.u32 [%rd1], %r12;
+; CHECK-NEXT:    ret;
+  %a.load = load <4 x i8>, ptr addrspace(1) %a
+  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+  store <4 x i8> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_8xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_8xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NEXT:    .reg .b32 %r<25>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_8xi8_param_0];
+; CHECK-NEXT:    ld.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT:    st.global.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT:    ret;
+  %a.load = load <8 x i8>, ptr addrspace(1) %a
+  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <8 x i8> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_16xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_16xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<33>;
+; CHECK-NEXT:    .reg .b32 %r<49>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_16xi8_param_0];
+; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT:    ret;
+  %a.load = load <16 x i8>, ptr addrspace(1) %a
+  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <16 x i8> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_2xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi16_param_0];
+; CHECK-NEXT:    ld.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.global.u32 [%rd1], %r2;
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x i16>, ptr addrspace(1) %a
+  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+  store <2 x i16> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_4xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi16_param_0];
+; CHECK-NEXT:    ld.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT:    st.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    ret;
+  %a.load = load <4 x i16>, ptr addrspace(1) %a
+  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+  store <4 x i16> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_8xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_8xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_8xi16_param_0];
+; CHECK-NEXT:    ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT:    st.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    ret;
+  %a.load = load <8 x i16>, ptr addrspace(1) %a
+  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <8 x i16> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_2xi32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi32_param_0];
+; CHECK-NEXT:    ld.global.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.s32 %r3, %r2, 1;
+; CHECK-NEXT:    add.s32 %r4, %r1, 1;
+; CHECK-NEXT:    st.global.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x i32>, ptr addrspace(1) %a
+  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+  store <2 x i32> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_4xi32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xi32_param_0];
+; CHECK-NEXT:    ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.s32 %r5, %r4, 1;
+; CHECK-NEXT:    add.s32 %r6, %r3, 1;
+; CHECK-NEXT:    add.s32 %r7, %r2, 1;
+; CHECK-NEXT:    add.s32 %r8, %r1, 1;
+; CHECK-NEXT:    st.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    ret;
+  %a.load = load <4 x i32>, ptr addrspace(1) %a
+  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_2xi64(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xi64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xi64_param_0];
+; CHECK-NEXT:    ld.global.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT:    st.global.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x i64>, ptr addrspace(1) %a
+  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+  store <2 x i64> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_2xfloat(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xfloat_param_0];
+; CHECK-NEXT:    ld.global.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT:    st.global.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x float>, ptr addrspace(1) %a
+  %a.add = fadd <2 x float> %a.load, <float 1.0, float 1.0>
+  store <2 x float> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_4xfloat(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_4xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_4xfloat_param_0];
+; CHECK-NEXT:    ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT:    st.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ret;
+  %a.load = load <4 x float>, ptr addrspace(1) %a
+  %a.add = fadd <4 x float> %a.load, <float 1.0, float 1.0, float 1.0, float 1.0>
+  store <4 x float> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_2xdouble(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_2xdouble(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_2xdouble_param_0];
+; CHECK-NEXT:    ld.global.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.global.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x double>, ptr addrspace(1) %a
+  %a.add = fadd <2 x double> %a.load, <double 1.0, double 1.0>
+  store <2 x double> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+; global_volatile
+
+define void @global_volatile_2xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi8_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    st.volatile.global.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i8>, ptr addrspace(1) %a
+  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+  store volatile <2 x i8> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_4xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi8_param_0];
+; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
+; CHECK-NEXT:    st.volatile.global.u32 [%rd1], %r12;
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x i8>, ptr addrspace(1) %a
+  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+  store volatile <4 x i8> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_8xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_8xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NEXT:    .reg .b32 %r<25>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_8xi8_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT:    st.volatile.global.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <8 x i8>, ptr addrspace(1) %a
+  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store volatile <8 x i8> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_16xi8(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_16xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<33>;
+; CHECK-NEXT:    .reg .b32 %r<49>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_16xi8_param_0];
+; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <16 x i8>, ptr addrspace(1) %a
+  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store volatile <16 x i8> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_2xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi16_param_0];
+; CHECK-NEXT:    ld.volatile.global.u32 %r1, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.volatile.global.u32 [%rd1], %r2;
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i16>, ptr addrspace(1) %a
+  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+  store volatile <2 x i16> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_4xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi16_param_0];
+; CHECK-NEXT:    ld.volatile.global.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT:    st.volatile.global.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x i16>, ptr addrspace(1) %a
+  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+  store volatile <4 x i16> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_8xi16(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_8xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_8xi16_param_0];
+; CHECK-NEXT:    ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT:    st.volatile.global.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <8 x i16>, ptr addrspace(1) %a
+  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store volatile <8 x i16> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_2xi32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi32_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.s32 %r3, %r2, 1;
+; CHECK-NEXT:    add.s32 %r4, %r1, 1;
+; CHECK-NEXT:    st.volatile.global.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i32>, ptr addrspace(1) %a
+  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+  store volatile <2 x i32> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_4xi32(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xi32_param_0];
+; CHECK-NEXT:    ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.s32 %r5, %r4, 1;
+; CHECK-NEXT:    add.s32 %r6, %r3, 1;
+; CHECK-NEXT:    add.s32 %r7, %r2, 1;
+; CHECK-NEXT:    add.s32 %r8, %r1, 1;
+; CHECK-NEXT:    st.volatile.global.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x i32>, ptr addrspace(1) %a
+  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+  store volatile <4 x i32> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_2xi64(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xi64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xi64_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT:    st.volatile.global.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i64>, ptr addrspace(1) %a
+  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+  store volatile <2 x i64> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_2xfloat(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x float>, ptr addrspace(1) %a
+  %a.add = fadd <2 x float> %a.load, <float 1.0, float 1.0>
+  store volatile <2 x float> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_4xfloat(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_4xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_4xfloat_param_0];
+; CHECK-NEXT:    ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT:    st.volatile.global.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x float>, ptr addrspace(1) %a
+  %a.add = fadd <4 x float> %a.load, <float 1.0, float 1.0, float 1.0, float 1.0>
+  store volatile <4 x float> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+define void @global_volatile_2xdouble(ptr addrspace(1) %a) {
+; CHECK-LABEL: global_volatile_2xdouble(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [global_volatile_2xdouble_param_0];
+; CHECK-NEXT:    ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.volatile.global.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x double>, ptr addrspace(1) %a
+  %a.add = fadd <2 x double> %a.load, <double 1.0, double 1.0>
+  store volatile <2 x double> %a.add, ptr addrspace(1) %a
+  ret void
+}
+
+;; shared statespace
+
+; shared
+
+define void @shared_2xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi8_param_0];
+; CHECK-NEXT:    ld.shared.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    st.shared.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x i8>, ptr addrspace(3) %a
+  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+  store <2 x i8> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_4xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_4xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi8_param_0];
+; CHECK-NEXT:    ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
+; CHECK-NEXT:    st.shared.u32 [%rd1], %r12;
+; CHECK-NEXT:    ret;
+  %a.load = load <4 x i8>, ptr addrspace(3) %a
+  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+  store <4 x i8> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_8xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_8xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NEXT:    .reg .b32 %r<25>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_8xi8_param_0];
+; CHECK-NEXT:    ld.shared.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT:    bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT:    prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT:    bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT:    bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT:    prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT:    bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT:    prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT:    st.shared.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT:    ret;
+  %a.load = load <8 x i8>, ptr addrspace(3) %a
+  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <8 x i8> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_16xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_16xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<33>;
+; CHECK-NEXT:    .reg .b32 %r<49>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_16xi8_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT:    bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT:    prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT:    bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT:    prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT:    add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT:    bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT:    add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT:    prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT:    add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT:    bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT:    add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT:    prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT:    add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT:    bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT:    add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT:    prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT:    add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT:    bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT:    add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT:    prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT:    bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT:    add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT:    bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT:    add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT:    prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT:    add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT:    bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT:    add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT:    prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT:    ret;
+  %a.load = load <16 x i8>, ptr addrspace(3) %a
+  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <16 x i8> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_2xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi16_param_0];
+; CHECK-NEXT:    ld.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT:    st.shared.u32 [%rd1], %r2;
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x i16>, ptr addrspace(3) %a
+  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+  store <2 x i16> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_4xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_4xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi16_param_0];
+; CHECK-NEXT:    ld.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT:    add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT:    add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT:    st.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT:    ret;
+  %a.load = load <4 x i16>, ptr addrspace(3) %a
+  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+  store <4 x i16> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_8xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_8xi16(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<17>;
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_8xi16_param_0];
+; CHECK-NEXT:    ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT:    mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT:    add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT:    add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT:    mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT:    mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT:    add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT:    add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT:    mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT:    mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT:    add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT:    add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT:    mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT:    st.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    ret;
+  %a.load = load <8 x i16>, ptr addrspace(3) %a
+  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <8 x i16> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_2xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi32_param_0];
+; CHECK-NEXT:    ld.shared.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT:    add.s32 %r3, %r2, 1;
+; CHECK-NEXT:    add.s32 %r4, %r1, 1;
+; CHECK-NEXT:    st.shared.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x i32>, ptr addrspace(3) %a
+  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+  store <2 x i32> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_4xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_4xi32(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xi32_param_0];
+; CHECK-NEXT:    ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT:    add.s32 %r5, %r4, 1;
+; CHECK-NEXT:    add.s32 %r6, %r3, 1;
+; CHECK-NEXT:    add.s32 %r7, %r2, 1;
+; CHECK-NEXT:    add.s32 %r8, %r1, 1;
+; CHECK-NEXT:    st.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT:    ret;
+  %a.load = load <4 x i32>, ptr addrspace(3) %a
+  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_2xi64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xi64(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xi64_param_0];
+; CHECK-NEXT:    ld.shared.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT:    add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT:    add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT:    st.shared.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x i64>, ptr addrspace(3) %a
+  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+  store <2 x i64> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_2xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xfloat_param_0];
+; CHECK-NEXT:    ld.shared.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT:    st.shared.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x float>, ptr addrspace(3) %a
+  %a.add = fadd <2 x float> %a.load, <float 1.0, float 1.0>
+  store <2 x float> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_4xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_4xfloat(
+; CHECK:       {
+; CHECK-NEXT:    .reg .f32 %f<9>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_4xfloat_param_0];
+; CHECK-NEXT:    ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT:    add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT:    add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT:    st.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT:    ret;
+  %a.load = load <4 x float>, ptr addrspace(3) %a
+  %a.add = fadd <4 x float> %a.load, <float 1.0, float 1.0, float 1.0, float 1.0>
+  store <4 x float> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_2xdouble(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_2xdouble(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-NEXT:    .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_2xdouble_param_0];
+; CHECK-NEXT:    ld.shared.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT:    add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT:    add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT:    st.shared.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT:    ret;
+  %a.load = load <2 x double>, ptr addrspace(3) %a
+  %a.add = fadd <2 x double> %a.load, <double 1.0, double 1.0>
+  store <2 x double> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+; shared_volatile
+
+define void @shared_volatile_2xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_2xi8_param_0];
+; CHECK-NEXT:    ld.volatile.shared.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT:    add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT:    add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT:    st.volatile.shared.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <2 x i8>, ptr addrspace(3) %a
+  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+  store volatile <2 x i8> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_4xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xi8(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b16 %rs<9>;
+; CHECK-NEXT:    .reg .b32 %r<13>;
+; CHECK-NEXT:    .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [shared_volatile_4xi8_param_0];
+; CHECK-NEXT:    ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT:    bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT:    add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT:    bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT:    add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT:    prmt.b32 %r6, %r5, %r3, 0x3340U;
+; CHECK-NEXT:    bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT:    add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT:    bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT:    cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT:    add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT:    cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT:    prmt.b32 %r11, %r10, %r8, 0x3340U;
+; CHECK-NEXT:    prmt.b32 %r12, %r11, %r6, 0x5410U;
+; CHECK-NEXT:    st.volatile.shared.u32 [%rd1], %r12;
+; CHECK-NEXT:    ret;
+  %a.load = load volatile <4 x i8>, ptr addrspace(3) %a
+  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+  store volatile <4 x i8> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
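+; Note: as the CHECKs below show, volatile <8 x i8> and <16 x i8> accesses
+; lower to full-vector-volatile v2.b32/v4.b32 operations on the packed words,
+; matching the <4 x i8> behavior described in the TODO above.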
+define void @shared_volatile_8xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.volatile.shared.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <8 x i8>, ptr addrspace(3) %a
+  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store volatile <8 x i8> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_16xi8(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <16 x i8>, ptr addrspace(3) %a
+  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store volatile <16 x i8> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_2xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.volatile.shared.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x i16>, ptr addrspace(3) %a
+  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+  store volatile <2 x i16> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_4xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.volatile.shared.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <4 x i16>, ptr addrspace(3) %a
+  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+  store volatile <4 x i16> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_8xi16(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <8 x i16>, ptr addrspace(3) %a
+  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store volatile <8 x i16> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_2xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi32_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x i32>, ptr addrspace(3) %a
+  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+  store volatile <2 x i32> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_4xi32(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi32_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <4 x i32>, ptr addrspace(3) %a
+  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+  store volatile <4 x i32> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_2xi64(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xi64_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x i64>, ptr addrspace(3) %a
+  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+  store volatile <2 x i64> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_2xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xfloat_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x float>, ptr addrspace(3) %a
+  %a.add = fadd <2 x float> %a.load, <float 1.000000e+00, float 1.000000e+00>
+  store volatile <2 x float> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_4xfloat(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xfloat_param_0];
+; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <4 x float>, ptr addrspace(3) %a
+  %a.add = fadd <4 x float> %a.load, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  store volatile <4 x float> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+define void @shared_volatile_2xdouble(ptr addrspace(3) %a) {
+; CHECK-LABEL: shared_volatile_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_2xdouble_param_0];
+; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x double>, ptr addrspace(3) %a
+  %a.add = fadd <2 x double> %a.load, <double 1.000000e+00, double 1.000000e+00>
+  store volatile <2 x double> %a.add, ptr addrspace(3) %a
+  ret void
+}
+
+;; local statespace
+
+; local
+
+define void @local_2xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi8_param_0];
+; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+  %a.load = load <2 x i8>, ptr addrspace(5) %a
+  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+  store <2 x i8> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_4xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi8_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U;
+; CHECK-NEXT: st.local.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+  %a.load = load <4 x i8>, ptr addrspace(5) %a
+  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+  store <4 x i8> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_8xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi8_param_0];
+; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+  %a.load = load <8 x i8>, ptr addrspace(5) %a
+  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <8 x i8> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_16xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_16xi8_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+  %a.load = load <16 x i8>, ptr addrspace(5) %a
+  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store <16 x i8> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_2xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi16_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+  %a.load = load <2 x i16>, ptr addrspace(5) %a
+  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+  store <2 x i16> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_4xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi16_param_0];
+; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+  %a.load = load <4 x i16>, ptr addrspace(5) %a
+  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+  store <4 x i16> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_8xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi16_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+  %a.load = load <8 x i16>, ptr addrspace(5) %a
+  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store <8 x i16> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_2xi32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi32_param_0];
+; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+  %a.load = load <2 x i32>, ptr addrspace(5) %a
+  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+  store <2 x i32> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_4xi32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi32_param_0];
+; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+  %a.load = load <4 x i32>, ptr addrspace(5) %a
+  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+  store <4 x i32> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_2xi64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xi64_param_0];
+; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+  %a.load = load <2 x i64>, ptr addrspace(5) %a
+  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+  store <2 x i64> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_2xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xfloat_param_0];
+; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+  %a.load = load <2 x float>, ptr addrspace(5) %a
+  %a.add = fadd <2 x float> %a.load, <float 1.000000e+00, float 1.000000e+00>
+  store <2 x float> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_4xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_4xfloat_param_0];
+; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+  %a.load = load <4 x float>, ptr addrspace(5) %a
+  %a.add = fadd <4 x float> %a.load, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  store <4 x float> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_2xdouble(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_2xdouble_param_0];
+; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+  %a.load = load <2 x double>, ptr addrspace(5) %a
+  %a.add = fadd <2 x double> %a.load, <double 1.000000e+00, double 1.000000e+00>
+  store <2 x double> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+; local_volatile
+
+define void @local_volatile_2xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi8_param_0];
+; CHECK-NEXT: ld.local.v2.u8 {%rs1, %rs2}, [%rd1];
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: st.local.v2.u8 [%rd1], {%rs4, %rs3};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x i8>, ptr addrspace(5) %a
+  %a.add = add <2 x i8> %a.load, <i8 1, i8 1>
+  store volatile <2 x i8> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_4xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_4xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b32 %r<13>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi8_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: bfe.u32 %r2, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r2;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-NEXT: bfe.u32 %r4, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r4;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r5, %rs4;
+; CHECK-NEXT: prmt.b32 %r6, %r5, %r3, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r7, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r7;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs6;
+; CHECK-NEXT: bfe.u32 %r9, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r9;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r10, %rs8;
+; CHECK-NEXT: prmt.b32 %r11, %r10, %r8, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r6, 0x5410U;
+; CHECK-NEXT: st.local.u32 [%rd1], %r12;
+; CHECK-NEXT: ret;
+  %a.load = load volatile <4 x i8>, ptr addrspace(5) %a
+  %a.add = add <4 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1>
+  store volatile <4 x i8> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_8xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_8xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<25>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi8_param_0];
+; CHECK-NEXT: ld.local.v2.b32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r3, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r3;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-NEXT: bfe.u32 %r5, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r5;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs4;
+; CHECK-NEXT: prmt.b32 %r7, %r6, %r4, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r8, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r8;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r9, %rs6;
+; CHECK-NEXT: bfe.u32 %r10, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r10;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs8;
+; CHECK-NEXT: prmt.b32 %r12, %r11, %r9, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r7, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r14, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r14;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r15, %rs10;
+; CHECK-NEXT: bfe.u32 %r16, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r16;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs12;
+; CHECK-NEXT: prmt.b32 %r18, %r17, %r15, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r19, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r19;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r20, %rs14;
+; CHECK-NEXT: bfe.u32 %r21, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r21;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs16;
+; CHECK-NEXT: prmt.b32 %r23, %r22, %r20, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r24, %r23, %r18, 0x5410U;
+; CHECK-NEXT: st.local.v2.b32 [%rd1], {%r24, %r13};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <8 x i8>, ptr addrspace(5) %a
+  %a.add = add <8 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store volatile <8 x i8> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_16xi8(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_16xi8(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<33>;
+; CHECK-NEXT: .reg .b32 %r<49>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xi8_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: bfe.u32 %r5, %r4, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs1, %r5;
+; CHECK-NEXT: add.s16 %rs2, %rs1, 1;
+; CHECK-NEXT: cvt.u32.u16 %r6, %rs2;
+; CHECK-NEXT: bfe.u32 %r7, %r4, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs3, %r7;
+; CHECK-NEXT: add.s16 %rs4, %rs3, 1;
+; CHECK-NEXT: cvt.u32.u16 %r8, %rs4;
+; CHECK-NEXT: prmt.b32 %r9, %r8, %r6, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r10, %r4, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs5, %r10;
+; CHECK-NEXT: add.s16 %rs6, %rs5, 1;
+; CHECK-NEXT: cvt.u32.u16 %r11, %rs6;
+; CHECK-NEXT: bfe.u32 %r12, %r4, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs7, %r12;
+; CHECK-NEXT: add.s16 %rs8, %rs7, 1;
+; CHECK-NEXT: cvt.u32.u16 %r13, %rs8;
+; CHECK-NEXT: prmt.b32 %r14, %r13, %r11, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r15, %r14, %r9, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r16, %r3, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs9, %r16;
+; CHECK-NEXT: add.s16 %rs10, %rs9, 1;
+; CHECK-NEXT: cvt.u32.u16 %r17, %rs10;
+; CHECK-NEXT: bfe.u32 %r18, %r3, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs11, %r18;
+; CHECK-NEXT: add.s16 %rs12, %rs11, 1;
+; CHECK-NEXT: cvt.u32.u16 %r19, %rs12;
+; CHECK-NEXT: prmt.b32 %r20, %r19, %r17, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r21, %r3, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs13, %r21;
+; CHECK-NEXT: add.s16 %rs14, %rs13, 1;
+; CHECK-NEXT: cvt.u32.u16 %r22, %rs14;
+; CHECK-NEXT: bfe.u32 %r23, %r3, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs15, %r23;
+; CHECK-NEXT: add.s16 %rs16, %rs15, 1;
+; CHECK-NEXT: cvt.u32.u16 %r24, %rs16;
+; CHECK-NEXT: prmt.b32 %r25, %r24, %r22, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r26, %r25, %r20, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r27, %r2, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs17, %r27;
+; CHECK-NEXT: add.s16 %rs18, %rs17, 1;
+; CHECK-NEXT: cvt.u32.u16 %r28, %rs18;
+; CHECK-NEXT: bfe.u32 %r29, %r2, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs19, %r29;
+; CHECK-NEXT: add.s16 %rs20, %rs19, 1;
+; CHECK-NEXT: cvt.u32.u16 %r30, %rs20;
+; CHECK-NEXT: prmt.b32 %r31, %r30, %r28, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r32, %r2, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs21, %r32;
+; CHECK-NEXT: add.s16 %rs22, %rs21, 1;
+; CHECK-NEXT: cvt.u32.u16 %r33, %rs22;
+; CHECK-NEXT: bfe.u32 %r34, %r2, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs23, %r34;
+; CHECK-NEXT: add.s16 %rs24, %rs23, 1;
+; CHECK-NEXT: cvt.u32.u16 %r35, %rs24;
+; CHECK-NEXT: prmt.b32 %r36, %r35, %r33, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r37, %r36, %r31, 0x5410U;
+; CHECK-NEXT: bfe.u32 %r38, %r1, 24, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs25, %r38;
+; CHECK-NEXT: add.s16 %rs26, %rs25, 1;
+; CHECK-NEXT: cvt.u32.u16 %r39, %rs26;
+; CHECK-NEXT: bfe.u32 %r40, %r1, 16, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs27, %r40;
+; CHECK-NEXT: add.s16 %rs28, %rs27, 1;
+; CHECK-NEXT: cvt.u32.u16 %r41, %rs28;
+; CHECK-NEXT: prmt.b32 %r42, %r41, %r39, 0x3340U;
+; CHECK-NEXT: bfe.u32 %r43, %r1, 8, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs29, %r43;
+; CHECK-NEXT: add.s16 %rs30, %rs29, 1;
+; CHECK-NEXT: cvt.u32.u16 %r44, %rs30;
+; CHECK-NEXT: bfe.u32 %r45, %r1, 0, 8;
+; CHECK-NEXT: cvt.u16.u32 %rs31, %r45;
+; CHECK-NEXT: add.s16 %rs32, %rs31, 1;
+; CHECK-NEXT: cvt.u32.u16 %r46, %rs32;
+; CHECK-NEXT: prmt.b32 %r47, %r46, %r44, 0x3340U;
+; CHECK-NEXT: prmt.b32 %r48, %r47, %r42, 0x5410U;
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r48, %r37, %r26, %r15};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <16 x i8>, ptr addrspace(5) %a
+  %a.add = add <16 x i8> %a.load, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1>
+  store volatile <16 x i8> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_2xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<5>;
+; CHECK-NEXT: .reg .b32 %r<3>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi16_param_0];
+; CHECK-NEXT: ld.local.u32 %r1, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r1;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r2, {%rs4, %rs3};
+; CHECK-NEXT: st.local.u32 [%rd1], %r2;
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x i16>, ptr addrspace(5) %a
+  %a.add = add <2 x i16> %a.load, <i16 1, i16 1>
+  store volatile <2 x i16> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_4xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_4xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi16_param_0];
+; CHECK-NEXT: ld.local.v4.u16 {%rs1, %rs2, %rs3, %rs4}, [%rd1];
+; CHECK-NEXT: add.s16 %rs5, %rs4, 1;
+; CHECK-NEXT: add.s16 %rs6, %rs3, 1;
+; CHECK-NEXT: add.s16 %rs7, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs1, 1;
+; CHECK-NEXT: st.local.v4.u16 [%rd1], {%rs8, %rs7, %rs6, %rs5};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <4 x i16>, ptr addrspace(5) %a
+  %a.add = add <4 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1>
+  store volatile <4 x i16> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_8xi16(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_8xi16(
+; CHECK: {
+; CHECK-NEXT: .reg .b16 %rs<17>;
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi16_param_0];
+; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: mov.b32 {%rs1, %rs2}, %r4;
+; CHECK-NEXT: add.s16 %rs3, %rs2, 1;
+; CHECK-NEXT: add.s16 %rs4, %rs1, 1;
+; CHECK-NEXT: mov.b32 %r5, {%rs4, %rs3};
+; CHECK-NEXT: mov.b32 {%rs5, %rs6}, %r3;
+; CHECK-NEXT: add.s16 %rs7, %rs6, 1;
+; CHECK-NEXT: add.s16 %rs8, %rs5, 1;
+; CHECK-NEXT: mov.b32 %r6, {%rs8, %rs7};
+; CHECK-NEXT: mov.b32 {%rs9, %rs10}, %r2;
+; CHECK-NEXT: add.s16 %rs11, %rs10, 1;
+; CHECK-NEXT: add.s16 %rs12, %rs9, 1;
+; CHECK-NEXT: mov.b32 %r7, {%rs12, %rs11};
+; CHECK-NEXT: mov.b32 {%rs13, %rs14}, %r1;
+; CHECK-NEXT: add.s16 %rs15, %rs14, 1;
+; CHECK-NEXT: add.s16 %rs16, %rs13, 1;
+; CHECK-NEXT: mov.b32 %r8, {%rs16, %rs15};
+; CHECK-NEXT: st.local.v4.b32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <8 x i16>, ptr addrspace(5) %a
+  %a.add = add <8 x i16> %a.load, <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>
+  store volatile <8 x i16> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_2xi32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi32_param_0];
+; CHECK-NEXT: ld.local.v2.u32 {%r1, %r2}, [%rd1];
+; CHECK-NEXT: add.s32 %r3, %r2, 1;
+; CHECK-NEXT: add.s32 %r4, %r1, 1;
+; CHECK-NEXT: st.local.v2.u32 [%rd1], {%r4, %r3};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x i32>, ptr addrspace(5) %a
+  %a.add = add <2 x i32> %a.load, <i32 1, i32 1>
+  store volatile <2 x i32> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_4xi32(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_4xi32(
+; CHECK: {
+; CHECK-NEXT: .reg .b32 %r<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi32_param_0];
+; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1];
+; CHECK-NEXT: add.s32 %r5, %r4, 1;
+; CHECK-NEXT: add.s32 %r6, %r3, 1;
+; CHECK-NEXT: add.s32 %r7, %r2, 1;
+; CHECK-NEXT: add.s32 %r8, %r1, 1;
+; CHECK-NEXT: st.local.v4.u32 [%rd1], {%r8, %r7, %r6, %r5};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <4 x i32>, ptr addrspace(5) %a
+  %a.add = add <4 x i32> %a.load, <i32 1, i32 1, i32 1, i32 1>
+  store volatile <4 x i32> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_2xi64(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xi64(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<6>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xi64_param_0];
+; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1];
+; CHECK-NEXT: add.s64 %rd4, %rd3, 1;
+; CHECK-NEXT: add.s64 %rd5, %rd2, 1;
+; CHECK-NEXT: st.local.v2.u64 [%rd1], {%rd5, %rd4};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x i64>, ptr addrspace(5) %a
+  %a.add = add <2 x i64> %a.load, <i64 1, i64 1>
+  store volatile <2 x i64> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_2xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<5>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xfloat_param_0];
+; CHECK-NEXT: ld.local.v2.f32 {%f1, %f2}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f3, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f4, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v2.f32 [%rd1], {%f4, %f3};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x float>, ptr addrspace(5) %a
+  %a.add = fadd <2 x float> %a.load, <float 1.000000e+00, float 1.000000e+00>
+  store volatile <2 x float> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_4xfloat(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_4xfloat(
+; CHECK: {
+; CHECK-NEXT: .reg .f32 %f<9>;
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xfloat_param_0];
+; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1];
+; CHECK-NEXT: add.rn.f32 %f5, %f4, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f6, %f3, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f7, %f2, 0f3F800000;
+; CHECK-NEXT: add.rn.f32 %f8, %f1, 0f3F800000;
+; CHECK-NEXT: st.local.v4.f32 [%rd1], {%f8, %f7, %f6, %f5};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <4 x float>, ptr addrspace(5) %a
+  %a.add = fadd <4 x float> %a.load, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+  store volatile <4 x float> %a.add, ptr addrspace(5) %a
+  ret void
+}
+
+define void @local_volatile_2xdouble(ptr addrspace(5) %a) {
+; CHECK-LABEL: local_volatile_2xdouble(
+; CHECK: {
+; CHECK-NEXT: .reg .b64 %rd<2>;
+; CHECK-NEXT: .reg .f64 %fd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT: // %bb.0:
+; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_2xdouble_param_0];
+; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1];
+; CHECK-NEXT: add.rn.f64 %fd3, %fd2, 0d3FF0000000000000;
+; CHECK-NEXT: add.rn.f64 %fd4, %fd1, 0d3FF0000000000000;
+; CHECK-NEXT: st.local.v2.f64 [%rd1], {%fd4, %fd3};
+; CHECK-NEXT: ret;
+  %a.load = load volatile <2 x double>, ptr addrspace(5) %a
+  %a.add = fadd <2 x double> %a.load, <double 1.000000e+00, double 1.000000e+00>
+  store volatile <2 x double> %a.add, ptr addrspace(5) %a
+  ret void
+}
diff --git a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
index 6db49b040c0e8..a6a286e608ced 100644
--- a/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
+++ b/llvm/test/CodeGen/NVPTX/shuffle-vec-undef-init.ll
@@ -6,19 +6,17 @@ target triple = "nvptx64-unknown-unknown"
 define void @kernel_func(ptr %in.vec, ptr %out.vec0) nounwind {
 ; CHECK-LABEL: kernel_func(
 ; CHECK: {
-; CHECK-NEXT: .reg .b32 %r<10>;
+; CHECK-NEXT: .reg .b32 %r<14>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT: // %bb.0:
 ; CHECK-NEXT: ld.param.u32 %r1, [kernel_func_param_0];
-; CHECK-NEXT: ld.u32 %r2, [%r1+8];
-; CHECK-NEXT: ld.u32 %r3, [%r1];
-; CHECK-NEXT: ld.u32 %r4, [%r1+24];
-; CHECK-NEXT: ld.u32 %r5, [%r1+16];
-; CHECK-NEXT: ld.param.u32 %r6, [kernel_func_param_1];
-; CHECK-NEXT: prmt.b32 %r7, %r5, %r4, 0x4000U;
-; CHECK-NEXT: prmt.b32 %r8, %r3, %r2, 0x40U;
-; CHECK-NEXT: prmt.b32 %r9, %r8, %r7, 0x7610U;
-; CHECK-NEXT: st.u32 [%r6], %r9;
+; CHECK-NEXT: ld.v4.b32 {%r2, %r3, %r4, %r5}, [%r1];
+; CHECK-NEXT: ld.v4.b32 {%r6, %r7, %r8, %r9}, [%r1+16];
+; CHECK-NEXT: ld.param.u32 %r10, [kernel_func_param_1];
+; CHECK-NEXT: prmt.b32 %r11, %r6, %r8, 0x4000U;
+; CHECK-NEXT: prmt.b32 %r12, %r2, %r4, 0x40U;
+; CHECK-NEXT: prmt.b32 %r13, %r12, %r11, 0x7610U;
+; CHECK-NEXT: st.u32 [%r10], %r13;
 ; CHECK-NEXT: ret;
 %wide.vec = load <32 x i8>, ptr %in.vec, align 64
 %vec0 = shufflevector <32 x i8> %wide.vec, <32 x i8> undef, <4 x i32> <i32 0, i32 8, i32 16, i32 24>
diff --git a/llvm/test/CodeGen/NVPTX/vector-stores.ll b/llvm/test/CodeGen/NVPTX/vector-stores.ll
index 2563d9630e76a..cbcaf5fc3822e 100644
--- a/llvm/test/CodeGen/NVPTX/vector-stores.ll
+++ b/llvm/test/CodeGen/NVPTX/vector-stores.ll
@@ -31,8 +31,8 @@ define void @foo4(<4 x i32> %val, ptr %ptr) {
 ; CHECK-LABEL: .visible .func v16i8
 define void @v16i8(ptr %a, ptr %b) {
-; CHECK: ld.v4.u32
-; CHECK: st.v4.u32
+; CHECK: ld.v4.b32
+; CHECK: st.v4.b32
 %v = load <16 x i8>, ptr %a
 store <16 x i8> %v, ptr %b
 ret void