From f2d540324eb7a8ac7c2fc136c52b60726a0ff49d Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Fri, 9 May 2025 16:33:58 +0000 Subject: [PATCH 01/11] [NVPTX] Vectorize and lower 256-bit global loads/stores for sm_100+ and ptx88+ --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 3 + llvm/lib/Target/NVPTX/NVPTX.h | 3 +- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 81 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 59 +- llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 2 + llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 33 +- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 10 + llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 3 + .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 7 + .../Target/NVPTX/NVPTXTargetTransformInfo.h | 2 + llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll | 520 ++++++ .../load-store-256-addressing-invariant.ll | 549 +++++++ .../NVPTX/load-store-256-addressing.ll | 543 +++++++ .../CodeGen/NVPTX/load-store-vectors-256.ll | 1442 +++++++++++++++++ .../LoadStoreVectorizer/NVPTX/256-bit.ll | 728 +++++++++ 15 files changed, 3964 insertions(+), 21 deletions(-) create mode 100644 llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll create mode 100644 llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll create mode 100644 llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll create mode 100644 llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll create mode 100644 llvm/test/Transforms/LoadStoreVectorizer/NVPTX/256-bit.ll diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index 0b137250e4e59..ab1c3c19168af 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -319,6 +319,9 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, case NVPTX::PTXLdStInstCode::V4: O << ".v4"; return; + case NVPTX::PTXLdStInstCode::V8: + O << ".v8"; + return; } // TODO: evaluate whether cases not covered by this switch are bugs return; diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 83090ab720c73..2468b8f43ae94 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -199,7 +199,8 @@ enum FromType { enum VecType { Scalar = 1, V2 = 2, - V4 = 4 + V4 = 4, + V8 = 8 }; } // namespace PTXLdStInstCode diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 57971313ba42d..12d0cdb1d486c 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -129,6 +129,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { return; case NVPTXISD::LoadV2: case NVPTXISD::LoadV4: + case NVPTXISD::LoadV8: if (tryLoadVector(N)) return; break; @@ -139,6 +140,7 @@ void NVPTXDAGToDAGISel::Select(SDNode *N) { break; case NVPTXISD::StoreV2: case NVPTXISD::StoreV4: + case NVPTXISD::StoreV8: if (tryStoreVector(N)) return; break; @@ -1170,6 +1172,12 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { FromTypeWidth = TotalWidth / 4; VecType = NVPTX::PTXLdStInstCode::V4; break; + case NVPTXISD::LoadV8: + if (!Subtarget->has256BitMaskedLoadStore()) + return false; + FromTypeWidth = TotalWidth / 8; + VecType = NVPTX::PTXLdStInstCode::V8; + break; default: return false; } @@ -1180,7 +1188,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { } assert(isPowerOf2_32(FromTypeWidth) && FromTypeWidth >= 8 && - FromTypeWidth <= 128 && TotalWidth <= 128 && "Invalid width for load"); + FromTypeWidth <= 128 && TotalWidth <= 256 && "Invalid width for 
load"); SDValue Offset, Base; SelectADDR(N->getOperand(1), Base, Offset); @@ -1205,9 +1213,22 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { NVPTX::LDV_f32_v2, NVPTX::LDV_f64_v2); break; case NVPTXISD::LoadV4: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4, - NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4, std::nullopt, - NVPTX::LDV_f32_v4, std::nullopt); + Opcode = + pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::LDV_i8_v4, + NVPTX::LDV_i16_v4, NVPTX::LDV_i32_v4, NVPTX::LDV_i64_v4, + NVPTX::LDV_f32_v4, NVPTX::LDV_f64_v4); + break; + case NVPTXISD::LoadV8: + switch (EltVT.getSimpleVT().SimpleTy) { + case MVT::i32: + Opcode = NVPTX::LDV_i32_v8; + break; + case MVT::f32: + Opcode = NVPTX::LDV_f32_v8; + break; + default: + return false; + } break; } if (!Opcode) @@ -1303,7 +1324,8 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE, NVPTX::INT_PTX_LDG_G_v4i16_ELE, NVPTX::INT_PTX_LDG_G_v4i32_ELE, - std::nullopt, NVPTX::INT_PTX_LDG_G_v4f32_ELE, std::nullopt); + NVPTX::INT_PTX_LDG_G_v4i64_ELE, NVPTX::INT_PTX_LDG_G_v4f32_ELE, + NVPTX::INT_PTX_LDG_G_v4f64_ELE); break; case NVPTXISD::LDUV4: Opcode = pickOpcodeForVT( @@ -1311,6 +1333,24 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { NVPTX::INT_PTX_LDU_G_v4i16_ELE, NVPTX::INT_PTX_LDU_G_v4i32_ELE, std::nullopt, NVPTX::INT_PTX_LDU_G_v4f32_ELE, std::nullopt); break; + case NVPTXISD::LoadV8: + switch (EltVT.getSimpleVT().SimpleTy) { + case MVT::i32: + Opcode = NVPTX::INT_PTX_LDG_G_v8i32_ELE; + break; + case MVT::f32: + Opcode = NVPTX::INT_PTX_LDG_G_v8f32_ELE; + break; + case MVT::v2i16: + case MVT::v2f16: + case MVT::v2bf16: + case MVT::v4i8: + Opcode = NVPTX::INT_PTX_LDG_G_v8i32_ELE; + break; + default: + return false; + } + break; } if (!Opcode) return false; @@ -1462,6 +1502,16 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { N2 = N->getOperand(5); ToTypeWidth = TotalWidth / 4; break; + case NVPTXISD::StoreV8: + if (!Subtarget->has256BitMaskedLoadStore()) + return false; + VecType = NVPTX::PTXLdStInstCode::V8; + Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3), + N->getOperand(4), N->getOperand(5), N->getOperand(6), + N->getOperand(7), N->getOperand(8)}); + N2 = N->getOperand(9); + ToTypeWidth = TotalWidth / 8; + break; default: return false; } @@ -1471,7 +1521,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { } assert(isPowerOf2_32(ToTypeWidth) && ToTypeWidth >= 8 && ToTypeWidth <= 128 && - TotalWidth <= 128 && "Invalid width for store"); + TotalWidth <= 256 && "Invalid width for store"); SDValue Offset, Base; SelectADDR(N2, Base, Offset); @@ -1492,9 +1542,22 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { NVPTX::STV_f32_v2, NVPTX::STV_f64_v2); break; case NVPTXISD::StoreV4: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4, - NVPTX::STV_i16_v4, NVPTX::STV_i32_v4, std::nullopt, - NVPTX::STV_f32_v4, std::nullopt); + Opcode = + pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, NVPTX::STV_i8_v4, + NVPTX::STV_i16_v4, NVPTX::STV_i32_v4, NVPTX::STV_i64_v4, + NVPTX::STV_f32_v4, NVPTX::STV_f64_v4); + break; + case NVPTXISD::StoreV8: + switch (EltVT.getSimpleVT().SimpleTy) { + case MVT::i32: + Opcode = NVPTX::STV_i32_v8; + break; + case MVT::f32: + Opcode = NVPTX::STV_f32_v8; + break; + default: + return false; + } break; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index 3769aae7b620f..d7883b5d526aa 100644 --- 
a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -162,6 +162,14 @@ static bool IsPTXVectorType(MVT VT) { case MVT::v2f32: case MVT::v4f32: case MVT::v2f64: + case MVT::v4i64: + case MVT::v4f64: + case MVT::v8i32: + case MVT::v8f32: + case MVT::v16f16: // <8 x f16x2> + case MVT::v16bf16: // <8 x bf16x2> + case MVT::v16i16: // <8 x i16x2> + case MVT::v32i8: // <8 x i8x4> return true; } } @@ -179,7 +187,7 @@ static bool Is16bitsType(MVT VT) { // - unsigned int NumElts - The number of elements in the final vector // - EVT EltVT - The type of the elements in the final vector static std::optional> -getVectorLoweringShape(EVT VectorEVT) { +getVectorLoweringShape(EVT VectorEVT, bool CanLowerTo256Bit) { if (!VectorEVT.isSimple()) return std::nullopt; const MVT VectorVT = VectorEVT.getSimpleVT(); @@ -199,6 +207,15 @@ getVectorLoweringShape(EVT VectorEVT) { switch (VectorVT.SimpleTy) { default: return std::nullopt; + case MVT::v4i64: + case MVT::v4f64: + case MVT::v8i32: + case MVT::v8f32: + // This is a "native" vector type iff the address space is global + // and the target supports 256-bit loads/stores + if (!CanLowerTo256Bit) + return std::nullopt; + LLVM_FALLTHROUGH; case MVT::v2i8: case MVT::v2i16: case MVT::v2i32: @@ -215,6 +232,15 @@ getVectorLoweringShape(EVT VectorEVT) { case MVT::v4f32: // This is a "native" vector type return std::pair(NumElts, EltVT); + case MVT::v16f16: // <8 x f16x2> + case MVT::v16bf16: // <8 x bf16x2> + case MVT::v16i16: // <8 x i16x2> + case MVT::v32i8: // <8 x i8x4> + // This can be upsized into a "native" vector type iff the address space is + // global and the target supports 256-bit loads/stores. + if (!CanLowerTo256Bit) + return std::nullopt; + LLVM_FALLTHROUGH; case MVT::v8i8: // <2 x i8x4> case MVT::v8f16: // <4 x f16x2> case MVT::v8bf16: // <4 x bf16x2> @@ -1070,10 +1096,12 @@ const char *NVPTXTargetLowering::getTargetNodeName(unsigned Opcode) const { MAKE_CASE(NVPTXISD::ProxyReg) MAKE_CASE(NVPTXISD::LoadV2) MAKE_CASE(NVPTXISD::LoadV4) + MAKE_CASE(NVPTXISD::LoadV8) MAKE_CASE(NVPTXISD::LDUV2) MAKE_CASE(NVPTXISD::LDUV4) MAKE_CASE(NVPTXISD::StoreV2) MAKE_CASE(NVPTXISD::StoreV4) + MAKE_CASE(NVPTXISD::StoreV8) MAKE_CASE(NVPTXISD::FSHL_CLAMP) MAKE_CASE(NVPTXISD::FSHR_CLAMP) MAKE_CASE(NVPTXISD::BFE) @@ -3201,7 +3229,12 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { if (ValVT != MemVT) return SDValue(); - const auto NumEltsAndEltVT = getVectorLoweringShape(ValVT); + // 256-bit vectors are only allowed iff the address is global + // and the target supports 256-bit loads/stores + unsigned AddrSpace = cast(N)->getAddressSpace(); + bool CanLowerTo256Bit = + AddrSpace == ADDRESS_SPACE_GLOBAL && STI.has256BitMaskedLoadStore(); + const auto NumEltsAndEltVT = getVectorLoweringShape(ValVT, CanLowerTo256Bit); if (!NumEltsAndEltVT) return SDValue(); const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); @@ -3229,6 +3262,9 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { case 4: Opcode = NVPTXISD::StoreV4; break; + case 8: + Opcode = NVPTXISD::StoreV8; + break; } SmallVector Ops; @@ -5765,7 +5801,8 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, /// ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. 
static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, - SmallVectorImpl &Results) { + SmallVectorImpl &Results, + bool TargetHas256BitVectorLoadStore) { LoadSDNode *LD = cast(N); const EVT ResVT = LD->getValueType(0); const EVT MemVT = LD->getMemoryVT(); @@ -5775,7 +5812,12 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, if (ResVT != MemVT) return; - const auto NumEltsAndEltVT = getVectorLoweringShape(ResVT); + // 256-bit vectors are only allowed iff the address is global + // and the target supports 256-bit loads/stores + unsigned AddrSpace = cast(N)->getAddressSpace(); + bool CanLowerTo256Bit = + AddrSpace == ADDRESS_SPACE_GLOBAL && TargetHas256BitVectorLoadStore; + const auto NumEltsAndEltVT = getVectorLoweringShape(ResVT, CanLowerTo256Bit); if (!NumEltsAndEltVT) return; const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); @@ -5812,6 +5854,13 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, DAG.getVTList({LoadEltVT, LoadEltVT, LoadEltVT, LoadEltVT, MVT::Other}); break; } + case 8: { + Opcode = NVPTXISD::LoadV8; + EVT ListVTs[] = {LoadEltVT, LoadEltVT, LoadEltVT, LoadEltVT, LoadEltVT, + LoadEltVT, LoadEltVT, LoadEltVT, MVT::Other}; + LdResVTs = DAG.getVTList(ListVTs); + break; + } } SDLoc DL(LD); @@ -6084,7 +6133,7 @@ void NVPTXTargetLowering::ReplaceNodeResults( ReplaceBITCAST(N, DAG, Results); return; case ISD::LOAD: - ReplaceLoadVector(N, DAG, Results); + ReplaceLoadVector(N, DAG, Results, STI.has256BitMaskedLoadStore()); return; case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h index 7a8bf3bf33a94..3dff83d74538b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h @@ -84,10 +84,12 @@ enum NodeType : unsigned { FIRST_MEMORY_OPCODE, LoadV2 = FIRST_MEMORY_OPCODE, LoadV4, + LoadV8, LDUV2, // LDU.v2 LDUV4, // LDU.v4 StoreV2, StoreV4, + StoreV8, LoadParam, LoadParamV2, LoadParamV4, diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 6639554e450f2..1d2074b804f89 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2425,7 +2425,7 @@ let mayStore=1, hasSideEffects=0 in { // The following is used only in and after vector elementizations. Vector // elementization happens at the machine instruction level, so the following // instructions never appear in the DAG. 
-multiclass LD_VEC { +multiclass LD_VEC { def _v2 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, @@ -2438,17 +2438,27 @@ multiclass LD_VEC { LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; + if support_v8 then { + def _v8 : NVPTXInst< + (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, + regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, ADDR:$addr), + "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, " + "[$addr];", []>; + } } let mayLoad=1, hasSideEffects=0 in { defm LDV_i8 : LD_VEC; defm LDV_i16 : LD_VEC; - defm LDV_i32 : LD_VEC; + defm LDV_i32 : LD_VEC; defm LDV_i64 : LD_VEC; - defm LDV_f32 : LD_VEC; + defm LDV_f32 : LD_VEC; defm LDV_f64 : LD_VEC; } -multiclass ST_VEC { +multiclass ST_VEC { def _v2 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, @@ -2463,14 +2473,25 @@ multiclass ST_VEC { LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; + if support_v8 then { + def _v8 : NVPTXInst< + (outs), + (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, + regclass:$src5, regclass:$src6, regclass:$src7, regclass:$src8, + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + i32imm:$fromWidth, ADDR:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "\t[$addr], " + "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};", []>; + } } let mayStore=1, hasSideEffects=0 in { defm STV_i8 : ST_VEC; defm STV_i16 : ST_VEC; - defm STV_i32 : ST_VEC; + defm STV_i32 : ST_VEC; defm STV_i64 : ST_VEC; - defm STV_f32 : ST_VEC; + defm STV_f32 : ST_VEC; defm STV_f64 : ST_VEC; } diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 81a864b90c040..eb565e6219d69 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2400,6 +2400,12 @@ class VLDG_G_ELE_V4 : (ins ADDR:$src), "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>; +class VLDG_G_ELE_V8 : + NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, + regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), + (ins ADDR:$src), + "ld.global.nc.v8." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, [$src];", []>; + // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads. 
def INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"b8", Int16Regs>; def INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"b16", Int16Regs>; @@ -2413,6 +2419,10 @@ def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>; def INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"b32", Int32Regs>; def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>; +def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"u64", Int64Regs>; +def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"f64", Float64Regs>; +def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"u32", Int32Regs>; +def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"f32", Float32Regs>; multiclass NG_TO_G Preds = []> { if Supports32 then diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 0a4fc8d1435be..5552bba728160 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -72,6 +72,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; + bool has256BitMaskedLoadStore() const { + return SmVersion >= 100 && PTXVersion >= 88; + } bool hasAtomAddF64() const { return SmVersion >= 60; } bool hasAtomScope() const { return SmVersion >= 60; } bool hasAtomBitwise64() const { return SmVersion >= 32; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index 2ada2e464698a..b1484111f1d2d 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -591,6 +591,13 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, return nullptr; } +unsigned NVPTXTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { + // 256 bit loads/stores are currently only supported for global address space + if (AddrSpace == ADDRESS_SPACE_GLOBAL && ST->has256BitMaskedLoadStore()) + return 256; + return 128; +} + unsigned NVPTXTTIImpl::getAssumedAddrSpace(const Value *V) const { if (isa(V)) return ADDRESS_SPACE_LOCAL; diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h index a9bd5a0d01043..98aea4e535f0a 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h @@ -173,6 +173,8 @@ class NVPTXTTIImpl : public BasicTTIImplBase { bool collectFlatAddressOperands(SmallVectorImpl &OpIndexes, Intrinsic::ID IID) const override; + unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const override; + Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV, Value *NewV) const override; unsigned getAssumedAddrSpace(const Value *V) const override; diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll new file mode 100644 index 0000000000000..f4abcb37aa894 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -0,0 +1,520 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 -verify-machineinstrs | FileCheck %s -check-prefixes=SM90 +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 -verify-machineinstrs | FileCheck %s -check-prefixes=SM100 +; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + +; For 256-bit vectors, check that invariant loads from the 
+; global addrspace are lowered to ld.global.nc. + +define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) { +; SM90-LABEL: ld_global_v32i8( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<16>; +; SM90-NEXT: .reg .b32 %r<19>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v32i8_param_0]; +; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: bfe.u32 %r5, %r4, 0, 8; +; SM90-NEXT: cvt.u16.u32 %rs1, %r5; +; SM90-NEXT: bfe.u32 %r6, %r3, 0, 8; +; SM90-NEXT: cvt.u16.u32 %rs2, %r6; +; SM90-NEXT: bfe.u32 %r7, %r2, 0, 8; +; SM90-NEXT: cvt.u16.u32 %rs3, %r7; +; SM90-NEXT: bfe.u32 %r8, %r1, 0, 8; +; SM90-NEXT: cvt.u16.u32 %rs4, %r8; +; SM90-NEXT: ld.global.nc.v4.u32 {%r9, %r10, %r11, %r12}, [%rd1]; +; SM90-NEXT: bfe.u32 %r13, %r12, 0, 8; +; SM90-NEXT: cvt.u16.u32 %rs5, %r13; +; SM90-NEXT: bfe.u32 %r14, %r11, 0, 8; +; SM90-NEXT: cvt.u16.u32 %rs6, %r14; +; SM90-NEXT: bfe.u32 %r15, %r10, 0, 8; +; SM90-NEXT: cvt.u16.u32 %rs7, %r15; +; SM90-NEXT: bfe.u32 %r16, %r9, 0, 8; +; SM90-NEXT: cvt.u16.u32 %rs8, %r16; +; SM90-NEXT: add.s16 %rs9, %rs8, %rs7; +; SM90-NEXT: add.s16 %rs10, %rs6, %rs5; +; SM90-NEXT: add.s16 %rs11, %rs4, %rs3; +; SM90-NEXT: add.s16 %rs12, %rs2, %rs1; +; SM90-NEXT: add.s16 %rs13, %rs9, %rs10; +; SM90-NEXT: add.s16 %rs14, %rs11, %rs12; +; SM90-NEXT: add.s16 %rs15, %rs13, %rs14; +; SM90-NEXT: cvt.u32.u16 %r17, %rs15; +; SM90-NEXT: and.b32 %r18, %r17, 255; +; SM90-NEXT: st.param.b32 [func_retval0], %r18; +; SM90-NEXT: ret; +; +; SM100-LABEL: ld_global_v32i8( +; SM100: { +; SM100-NEXT: .reg .b16 %rs<16>; +; SM100-NEXT: .reg .b32 %r<19>; +; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v32i8_param_0]; +; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: bfe.u32 %r9, %r8, 0, 8; +; SM100-NEXT: cvt.u16.u32 %rs1, %r9; +; SM100-NEXT: bfe.u32 %r10, %r7, 0, 8; +; SM100-NEXT: cvt.u16.u32 %rs2, %r10; +; SM100-NEXT: bfe.u32 %r11, %r6, 0, 8; +; SM100-NEXT: cvt.u16.u32 %rs3, %r11; +; SM100-NEXT: bfe.u32 %r12, %r5, 0, 8; +; SM100-NEXT: cvt.u16.u32 %rs4, %r12; +; SM100-NEXT: bfe.u32 %r13, %r4, 0, 8; +; SM100-NEXT: cvt.u16.u32 %rs5, %r13; +; SM100-NEXT: bfe.u32 %r14, %r3, 0, 8; +; SM100-NEXT: cvt.u16.u32 %rs6, %r14; +; SM100-NEXT: bfe.u32 %r15, %r2, 0, 8; +; SM100-NEXT: cvt.u16.u32 %rs7, %r15; +; SM100-NEXT: bfe.u32 %r16, %r1, 0, 8; +; SM100-NEXT: cvt.u16.u32 %rs8, %r16; +; SM100-NEXT: add.s16 %rs9, %rs8, %rs7; +; SM100-NEXT: add.s16 %rs10, %rs6, %rs5; +; SM100-NEXT: add.s16 %rs11, %rs4, %rs3; +; SM100-NEXT: add.s16 %rs12, %rs2, %rs1; +; SM100-NEXT: add.s16 %rs13, %rs9, %rs10; +; SM100-NEXT: add.s16 %rs14, %rs11, %rs12; +; SM100-NEXT: add.s16 %rs15, %rs13, %rs14; +; SM100-NEXT: cvt.u32.u16 %r17, %rs15; +; SM100-NEXT: and.b32 %r18, %r17, 255; +; SM100-NEXT: st.param.b32 [func_retval0], %r18; +; SM100-NEXT: ret; + %a = load <32 x i8>, ptr addrspace(1) %ptr, !invariant.load !0 + %v1 = extractelement <32 x i8> %a, i32 0 + %v2 = extractelement <32 x i8> %a, i32 4 + %v3 = extractelement <32 x i8> %a, i32 8 + %v4 = extractelement <32 x i8> %a, i32 12 + %v5 = extractelement <32 x i8> %a, i32 16 + %v6 = extractelement <32 x i8> %a, i32 20 + %v7 = extractelement <32 x i8> %a, i32 24 + %v8 = extractelement <32 x i8> %a, i32 28 + %sum1 = add i8 %v1, %v2 + %sum2 = add i8 %v3, %v4 + %sum3 = add i8 %v5, %v6 + %sum4 = add i8 %v7, %v8 + %sum5 = add i8 %sum1, %sum2 + %sum6 = add i8 %sum3, %sum4 + %sum7 = add i8 %sum5, %sum6 + ret i8 
%sum7 +} + +define i16 @ld_global_v16i16(ptr addrspace(1) %ptr) { +; SM90-LABEL: ld_global_v16i16( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<16>; +; SM90-NEXT: .reg .b32 %r<10>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v16i16_param_0]; +; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: mov.b32 {%rs1, _}, %r4; +; SM90-NEXT: mov.b32 {%rs2, _}, %r3; +; SM90-NEXT: mov.b32 {%rs3, _}, %r2; +; SM90-NEXT: mov.b32 {%rs4, _}, %r1; +; SM90-NEXT: ld.global.nc.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: mov.b32 {%rs5, _}, %r8; +; SM90-NEXT: mov.b32 {%rs6, _}, %r7; +; SM90-NEXT: mov.b32 {%rs7, _}, %r6; +; SM90-NEXT: mov.b32 {%rs8, _}, %r5; +; SM90-NEXT: add.s16 %rs9, %rs8, %rs7; +; SM90-NEXT: add.s16 %rs10, %rs6, %rs5; +; SM90-NEXT: add.s16 %rs11, %rs4, %rs3; +; SM90-NEXT: add.s16 %rs12, %rs2, %rs1; +; SM90-NEXT: add.s16 %rs13, %rs9, %rs10; +; SM90-NEXT: add.s16 %rs14, %rs11, %rs12; +; SM90-NEXT: add.s16 %rs15, %rs13, %rs14; +; SM90-NEXT: cvt.u32.u16 %r9, %rs15; +; SM90-NEXT: st.param.b32 [func_retval0], %r9; +; SM90-NEXT: ret; +; +; SM100-LABEL: ld_global_v16i16( +; SM100: { +; SM100-NEXT: .reg .b16 %rs<16>; +; SM100-NEXT: .reg .b32 %r<10>; +; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v16i16_param_0]; +; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: mov.b32 {%rs1, _}, %r8; +; SM100-NEXT: mov.b32 {%rs2, _}, %r7; +; SM100-NEXT: mov.b32 {%rs3, _}, %r6; +; SM100-NEXT: mov.b32 {%rs4, _}, %r5; +; SM100-NEXT: mov.b32 {%rs5, _}, %r4; +; SM100-NEXT: mov.b32 {%rs6, _}, %r3; +; SM100-NEXT: mov.b32 {%rs7, _}, %r2; +; SM100-NEXT: mov.b32 {%rs8, _}, %r1; +; SM100-NEXT: add.s16 %rs9, %rs8, %rs7; +; SM100-NEXT: add.s16 %rs10, %rs6, %rs5; +; SM100-NEXT: add.s16 %rs11, %rs4, %rs3; +; SM100-NEXT: add.s16 %rs12, %rs2, %rs1; +; SM100-NEXT: add.s16 %rs13, %rs9, %rs10; +; SM100-NEXT: add.s16 %rs14, %rs11, %rs12; +; SM100-NEXT: add.s16 %rs15, %rs13, %rs14; +; SM100-NEXT: cvt.u32.u16 %r9, %rs15; +; SM100-NEXT: st.param.b32 [func_retval0], %r9; +; SM100-NEXT: ret; + %a = load <16 x i16>, ptr addrspace(1) %ptr, !invariant.load !0 + %v1 = extractelement <16 x i16> %a, i32 0 + %v2 = extractelement <16 x i16> %a, i32 2 + %v3 = extractelement <16 x i16> %a, i32 4 + %v4 = extractelement <16 x i16> %a, i32 6 + %v5 = extractelement <16 x i16> %a, i32 8 + %v6 = extractelement <16 x i16> %a, i32 10 + %v7 = extractelement <16 x i16> %a, i32 12 + %v8 = extractelement <16 x i16> %a, i32 14 + %sum1 = add i16 %v1, %v2 + %sum2 = add i16 %v3, %v4 + %sum3 = add i16 %v5, %v6 + %sum4 = add i16 %v7, %v8 + %sum5 = add i16 %sum1, %sum2 + %sum6 = add i16 %sum3, %sum4 + %sum7 = add i16 %sum5, %sum6 + ret i16 %sum7 +} + +define half @ld_global_v16f16(ptr addrspace(1) %ptr) { +; SM90-LABEL: ld_global_v16f16( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<16>; +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v16f16_param_0]; +; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: mov.b32 {%rs1, _}, %r4; +; SM90-NEXT: mov.b32 {%rs2, _}, %r3; +; SM90-NEXT: mov.b32 {%rs3, _}, %r2; +; SM90-NEXT: mov.b32 {%rs4, _}, %r1; +; SM90-NEXT: ld.global.nc.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: mov.b32 {%rs5, _}, %r8; +; SM90-NEXT: mov.b32 {%rs6, _}, %r7; +; SM90-NEXT: mov.b32 {%rs7, _}, %r6; +; SM90-NEXT: mov.b32 
{%rs8, _}, %r5; +; SM90-NEXT: add.rn.f16 %rs9, %rs8, %rs7; +; SM90-NEXT: add.rn.f16 %rs10, %rs6, %rs5; +; SM90-NEXT: add.rn.f16 %rs11, %rs4, %rs3; +; SM90-NEXT: add.rn.f16 %rs12, %rs2, %rs1; +; SM90-NEXT: add.rn.f16 %rs13, %rs9, %rs10; +; SM90-NEXT: add.rn.f16 %rs14, %rs11, %rs12; +; SM90-NEXT: add.rn.f16 %rs15, %rs13, %rs14; +; SM90-NEXT: st.param.b16 [func_retval0], %rs15; +; SM90-NEXT: ret; +; +; SM100-LABEL: ld_global_v16f16( +; SM100: { +; SM100-NEXT: .reg .b16 %rs<16>; +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v16f16_param_0]; +; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: mov.b32 {%rs1, _}, %r8; +; SM100-NEXT: mov.b32 {%rs2, _}, %r7; +; SM100-NEXT: mov.b32 {%rs3, _}, %r6; +; SM100-NEXT: mov.b32 {%rs4, _}, %r5; +; SM100-NEXT: mov.b32 {%rs5, _}, %r4; +; SM100-NEXT: mov.b32 {%rs6, _}, %r3; +; SM100-NEXT: mov.b32 {%rs7, _}, %r2; +; SM100-NEXT: mov.b32 {%rs8, _}, %r1; +; SM100-NEXT: add.rn.f16 %rs9, %rs8, %rs7; +; SM100-NEXT: add.rn.f16 %rs10, %rs6, %rs5; +; SM100-NEXT: add.rn.f16 %rs11, %rs4, %rs3; +; SM100-NEXT: add.rn.f16 %rs12, %rs2, %rs1; +; SM100-NEXT: add.rn.f16 %rs13, %rs9, %rs10; +; SM100-NEXT: add.rn.f16 %rs14, %rs11, %rs12; +; SM100-NEXT: add.rn.f16 %rs15, %rs13, %rs14; +; SM100-NEXT: st.param.b16 [func_retval0], %rs15; +; SM100-NEXT: ret; + %a = load <16 x half>, ptr addrspace(1) %ptr, !invariant.load !0 + %v1 = extractelement <16 x half> %a, i32 0 + %v2 = extractelement <16 x half> %a, i32 2 + %v3 = extractelement <16 x half> %a, i32 4 + %v4 = extractelement <16 x half> %a, i32 6 + %v5 = extractelement <16 x half> %a, i32 8 + %v6 = extractelement <16 x half> %a, i32 10 + %v7 = extractelement <16 x half> %a, i32 12 + %v8 = extractelement <16 x half> %a, i32 14 + %sum1 = fadd half %v1, %v2 + %sum2 = fadd half %v3, %v4 + %sum3 = fadd half %v5, %v6 + %sum4 = fadd half %v7, %v8 + %sum5 = fadd half %sum1, %sum2 + %sum6 = fadd half %sum3, %sum4 + %sum7 = fadd half %sum5, %sum6 + ret half %sum7 +} + +define bfloat @ld_global_v16bf16(ptr addrspace(1) %ptr) { +; SM90-LABEL: ld_global_v16bf16( +; SM90: { +; SM90-NEXT: .reg .b16 %rs<16>; +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v16bf16_param_0]; +; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: mov.b32 {%rs1, _}, %r4; +; SM90-NEXT: mov.b32 {%rs2, _}, %r3; +; SM90-NEXT: mov.b32 {%rs3, _}, %r2; +; SM90-NEXT: mov.b32 {%rs4, _}, %r1; +; SM90-NEXT: ld.global.nc.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: mov.b32 {%rs5, _}, %r8; +; SM90-NEXT: mov.b32 {%rs6, _}, %r7; +; SM90-NEXT: mov.b32 {%rs7, _}, %r6; +; SM90-NEXT: mov.b32 {%rs8, _}, %r5; +; SM90-NEXT: add.rn.bf16 %rs9, %rs8, %rs7; +; SM90-NEXT: add.rn.bf16 %rs10, %rs6, %rs5; +; SM90-NEXT: add.rn.bf16 %rs11, %rs4, %rs3; +; SM90-NEXT: add.rn.bf16 %rs12, %rs2, %rs1; +; SM90-NEXT: add.rn.bf16 %rs13, %rs9, %rs10; +; SM90-NEXT: add.rn.bf16 %rs14, %rs11, %rs12; +; SM90-NEXT: add.rn.bf16 %rs15, %rs13, %rs14; +; SM90-NEXT: st.param.b16 [func_retval0], %rs15; +; SM90-NEXT: ret; +; +; SM100-LABEL: ld_global_v16bf16( +; SM100: { +; SM100-NEXT: .reg .b16 %rs<16>; +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v16bf16_param_0]; +; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, 
%r8}, [%rd1]; +; SM100-NEXT: mov.b32 {%rs1, _}, %r8; +; SM100-NEXT: mov.b32 {%rs2, _}, %r7; +; SM100-NEXT: mov.b32 {%rs3, _}, %r6; +; SM100-NEXT: mov.b32 {%rs4, _}, %r5; +; SM100-NEXT: mov.b32 {%rs5, _}, %r4; +; SM100-NEXT: mov.b32 {%rs6, _}, %r3; +; SM100-NEXT: mov.b32 {%rs7, _}, %r2; +; SM100-NEXT: mov.b32 {%rs8, _}, %r1; +; SM100-NEXT: add.rn.bf16 %rs9, %rs8, %rs7; +; SM100-NEXT: add.rn.bf16 %rs10, %rs6, %rs5; +; SM100-NEXT: add.rn.bf16 %rs11, %rs4, %rs3; +; SM100-NEXT: add.rn.bf16 %rs12, %rs2, %rs1; +; SM100-NEXT: add.rn.bf16 %rs13, %rs9, %rs10; +; SM100-NEXT: add.rn.bf16 %rs14, %rs11, %rs12; +; SM100-NEXT: add.rn.bf16 %rs15, %rs13, %rs14; +; SM100-NEXT: st.param.b16 [func_retval0], %rs15; +; SM100-NEXT: ret; + %a = load <16 x bfloat>, ptr addrspace(1) %ptr, !invariant.load !0 + %v1 = extractelement <16 x bfloat> %a, i32 0 + %v2 = extractelement <16 x bfloat> %a, i32 2 + %v3 = extractelement <16 x bfloat> %a, i32 4 + %v4 = extractelement <16 x bfloat> %a, i32 6 + %v5 = extractelement <16 x bfloat> %a, i32 8 + %v6 = extractelement <16 x bfloat> %a, i32 10 + %v7 = extractelement <16 x bfloat> %a, i32 12 + %v8 = extractelement <16 x bfloat> %a, i32 14 + %sum1 = fadd bfloat %v1, %v2 + %sum2 = fadd bfloat %v3, %v4 + %sum3 = fadd bfloat %v5, %v6 + %sum4 = fadd bfloat %v7, %v8 + %sum5 = fadd bfloat %sum1, %sum2 + %sum6 = fadd bfloat %sum3, %sum4 + %sum7 = fadd bfloat %sum5, %sum6 + ret bfloat %sum7 +} + +define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) { +; SM90-LABEL: ld_global_v8i32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<16>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v8i32_param_0]; +; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.global.nc.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: add.s32 %r9, %r5, %r6; +; SM90-NEXT: add.s32 %r10, %r7, %r8; +; SM90-NEXT: add.s32 %r11, %r1, %r2; +; SM90-NEXT: add.s32 %r12, %r3, %r4; +; SM90-NEXT: add.s32 %r13, %r9, %r10; +; SM90-NEXT: add.s32 %r14, %r11, %r12; +; SM90-NEXT: add.s32 %r15, %r13, %r14; +; SM90-NEXT: st.param.b32 [func_retval0], %r15; +; SM90-NEXT: ret; +; +; SM100-LABEL: ld_global_v8i32( +; SM100: { +; SM100-NEXT: .reg .b32 %r<16>; +; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v8i32_param_0]; +; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: add.s32 %r9, %r1, %r2; +; SM100-NEXT: add.s32 %r10, %r3, %r4; +; SM100-NEXT: add.s32 %r11, %r5, %r6; +; SM100-NEXT: add.s32 %r12, %r7, %r8; +; SM100-NEXT: add.s32 %r13, %r9, %r10; +; SM100-NEXT: add.s32 %r14, %r11, %r12; +; SM100-NEXT: add.s32 %r15, %r13, %r14; +; SM100-NEXT: st.param.b32 [func_retval0], %r15; +; SM100-NEXT: ret; + %a = load <8 x i32>, ptr addrspace(1) %ptr, !invariant.load !0 + %v1 = extractelement <8 x i32> %a, i32 0 + %v2 = extractelement <8 x i32> %a, i32 1 + %v3 = extractelement <8 x i32> %a, i32 2 + %v4 = extractelement <8 x i32> %a, i32 3 + %v5 = extractelement <8 x i32> %a, i32 4 + %v6 = extractelement <8 x i32> %a, i32 5 + %v7 = extractelement <8 x i32> %a, i32 6 + %v8 = extractelement <8 x i32> %a, i32 7 + %sum1 = add i32 %v1, %v2 + %sum2 = add i32 %v3, %v4 + %sum3 = add i32 %v5, %v6 + %sum4 = add i32 %v7, %v8 + %sum5 = add i32 %sum1, %sum2 + %sum6 = add i32 %sum3, %sum4 + %sum7 = add i32 %sum5, %sum6 + + ret i32 %sum7 +} + +define float @ld_global_v8f32(ptr addrspace(1) %ptr) { +; SM90-LABEL: ld_global_v8f32( +; SM90: { +; SM90-NEXT: .reg 
.b32 %f<16>; +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v8f32_param_0]; +; SM90-NEXT: ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1+16]; +; SM90-NEXT: ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1]; +; SM90-NEXT: add.rn.f32 %f9, %f5, %f6; +; SM90-NEXT: add.rn.f32 %f10, %f7, %f8; +; SM90-NEXT: add.rn.f32 %f11, %f1, %f2; +; SM90-NEXT: add.rn.f32 %f12, %f3, %f4; +; SM90-NEXT: add.rn.f32 %f13, %f9, %f10; +; SM90-NEXT: add.rn.f32 %f14, %f11, %f12; +; SM90-NEXT: add.rn.f32 %f15, %f13, %f14; +; SM90-NEXT: st.param.f32 [func_retval0], %f15; +; SM90-NEXT: ret; +; +; SM100-LABEL: ld_global_v8f32( +; SM100: { +; SM100-NEXT: .reg .b32 %f<16>; +; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v8f32_param_0]; +; SM100-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; SM100-NEXT: add.rn.f32 %f9, %f1, %f2; +; SM100-NEXT: add.rn.f32 %f10, %f3, %f4; +; SM100-NEXT: add.rn.f32 %f11, %f5, %f6; +; SM100-NEXT: add.rn.f32 %f12, %f7, %f8; +; SM100-NEXT: add.rn.f32 %f13, %f9, %f10; +; SM100-NEXT: add.rn.f32 %f14, %f11, %f12; +; SM100-NEXT: add.rn.f32 %f15, %f13, %f14; +; SM100-NEXT: st.param.f32 [func_retval0], %f15; +; SM100-NEXT: ret; + %a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0 + %v1 = extractelement <8 x float> %a, i32 0 + %v2 = extractelement <8 x float> %a, i32 1 + %v3 = extractelement <8 x float> %a, i32 2 + %v4 = extractelement <8 x float> %a, i32 3 + %v5 = extractelement <8 x float> %a, i32 4 + %v6 = extractelement <8 x float> %a, i32 5 + %v7 = extractelement <8 x float> %a, i32 6 + %v8 = extractelement <8 x float> %a, i32 7 + %sum1 = fadd float %v1, %v2 + %sum2 = fadd float %v3, %v4 + %sum3 = fadd float %v5, %v6 + %sum4 = fadd float %v7, %v8 + %sum5 = fadd float %sum1, %sum2 + %sum6 = fadd float %sum3, %sum4 + %sum7 = fadd float %sum5, %sum6 + + ret float %sum7 +} + +define i64 @ld_global_v4i64(ptr addrspace(1) %ptr) { +; SM90-LABEL: ld_global_v4i64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<9>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v4i64_param_0]; +; SM90-NEXT: ld.global.nc.v2.u64 {%rd2, %rd3}, [%rd1+16]; +; SM90-NEXT: ld.global.nc.v2.u64 {%rd4, %rd5}, [%rd1]; +; SM90-NEXT: add.s64 %rd6, %rd4, %rd5; +; SM90-NEXT: add.s64 %rd7, %rd2, %rd3; +; SM90-NEXT: add.s64 %rd8, %rd6, %rd7; +; SM90-NEXT: st.param.b64 [func_retval0], %rd8; +; SM90-NEXT: ret; +; +; SM100-LABEL: ld_global_v4i64( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<9>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v4i64_param_0]; +; SM100-NEXT: ld.global.nc.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: add.s64 %rd6, %rd2, %rd3; +; SM100-NEXT: add.s64 %rd7, %rd4, %rd5; +; SM100-NEXT: add.s64 %rd8, %rd6, %rd7; +; SM100-NEXT: st.param.b64 [func_retval0], %rd8; +; SM100-NEXT: ret; + %a = load <4 x i64>, ptr addrspace(1) %ptr, !invariant.load !0 + %v1 = extractelement <4 x i64> %a, i32 0 + %v2 = extractelement <4 x i64> %a, i32 1 + %v3 = extractelement <4 x i64> %a, i32 2 + %v4 = extractelement <4 x i64> %a, i32 3 + %sum1 = add i64 %v1, %v2 + %sum2 = add i64 %v3, %v4 + %sum3 = add i64 %sum1, %sum2 + ret i64 %sum3 +} + +define double @ld_global_v4f64(ptr addrspace(1) %ptr) { +; SM90-LABEL: ld_global_v4f64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<2>; +; SM90-NEXT: .reg .b64 %fd<8>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, 
[ld_global_v4f64_param_0]; +; SM90-NEXT: ld.global.nc.v2.f64 {%fd1, %fd2}, [%rd1+16]; +; SM90-NEXT: ld.global.nc.v2.f64 {%fd3, %fd4}, [%rd1]; +; SM90-NEXT: add.rn.f64 %fd5, %fd3, %fd4; +; SM90-NEXT: add.rn.f64 %fd6, %fd1, %fd2; +; SM90-NEXT: add.rn.f64 %fd7, %fd5, %fd6; +; SM90-NEXT: st.param.f64 [func_retval0], %fd7; +; SM90-NEXT: ret; +; +; SM100-LABEL: ld_global_v4f64( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<2>; +; SM100-NEXT: .reg .b64 %fd<8>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v4f64_param_0]; +; SM100-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; SM100-NEXT: add.rn.f64 %fd5, %fd1, %fd2; +; SM100-NEXT: add.rn.f64 %fd6, %fd3, %fd4; +; SM100-NEXT: add.rn.f64 %fd7, %fd5, %fd6; +; SM100-NEXT: st.param.f64 [func_retval0], %fd7; +; SM100-NEXT: ret; + %a = load <4 x double>, ptr addrspace(1) %ptr, !invariant.load !0 + %v1 = extractelement <4 x double> %a, i32 0 + %v2 = extractelement <4 x double> %a, i32 1 + %v3 = extractelement <4 x double> %a, i32 2 + %v4 = extractelement <4 x double> %a, i32 3 + %sum1 = fadd double %v1, %v2 + %sum2 = fadd double %v3, %v4 + %sum3 = fadd double %sum1, %sum2 + ret double %sum3 +} + +!0 = !{} diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll new file mode 100644 index 0000000000000..a0bfbef53020f --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -0,0 +1,549 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + +; In this test, we check that all the addressing modes are lowered correctly +; for 256-bit invariant loads, which get lowered to ld.global.nc +; addr can be any of the following: +; - avar : direct address +; - asi: direct address + offset +; - areg_64: 64-bit register +; - ari_64: 64-bit register + offset +; Since this is a blackwell+ feature, +; and support for 32-bit addressing does not exist after sm_90, +; the "areg" and "ari" 32-bit addressing modes are not tested or supported. 
+ +; For invariant loads, asi is historically not supported, +; and instead it is selected as move into register, add of offset, and loaded as areg64 + +; Checks 8 types: i8, i16, bfloat, half, i32, i64, float, double + +; Global is the only address space that currently supports 256-bit loads/stores + +@globalin = external addrspace(1) global ptr +@globalout = external addrspace(1) global ptr + +define void @avar_i8() { +; PTX-LABEL: avar_i8( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <32 x i8>, ptr addrspace(1) @globalin, !invariant.load !0 + store <32 x i8> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_i16() { +; PTX-LABEL: avar_i16( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x i16>, ptr addrspace(1) @globalin, !invariant.load !0 + store <16 x i16> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_half() { +; PTX-LABEL: avar_half( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x half>, ptr addrspace(1) @globalin, !invariant.load !0 + store <16 x half> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_bfloat() { +; PTX-LABEL: avar_bfloat( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x bfloat>, ptr addrspace(1) @globalin, !invariant.load !0 + store <16 x bfloat> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_i32() { +; PTX-LABEL: avar_i32( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.u32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <8 x i32>, ptr addrspace(1) @globalin, !invariant.load !0 + store <8 x i32> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_i64() { +; PTX-LABEL: avar_i64( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.u64 [globalout], {%rd1, %rd2, %rd3, %rd4}; +; PTX-NEXT: ret; + %load = load <4 x i64>, ptr addrspace(1) @globalin, !invariant.load !0 + store <4 x i64> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_float() { +; PTX-LABEL: avar_float( +; PTX: { +; PTX-NEXT: .reg .b32 %f<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin]; +; PTX-NEXT: st.global.v8.f32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ret; + %load = load <8 x float>, ptr addrspace(1) @globalin, !invariant.load !0 + store <8 
x float> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_double() { +; PTX-LABEL: avar_double( +; PTX: { +; PTX-NEXT: .reg .b64 %fd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin]; +; PTX-NEXT: st.global.v4.f64 [globalout], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ret; + %load = load <4 x double>, ptr addrspace(1) @globalin, !invariant.load !0 + store <4 x double> %load, ptr addrspace(1) @globalout + ret void +} + +define void @asi_i8() { +; PTX-LABEL: asi_i8( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <32 x i8>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <32 x i8> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_i16() { +; PTX-LABEL: asi_i16( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <16 x i16>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <16 x i16> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_half() { +; PTX-LABEL: asi_half( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <16 x half>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <16 x half> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_bfloat() { +; PTX-LABEL: asi_bfloat( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <16 x bfloat>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <16 x bfloat> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_i32() { +; PTX-LABEL: asi_i32( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.u32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, 
i32 32 + store <8 x i32> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_i64() { +; PTX-LABEL: asi_i64( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.u64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <4 x i64> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_float() { +; PTX-LABEL: asi_float( +; PTX: { +; PTX-NEXT: .reg .b32 %f<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32]; +; PTX-NEXT: st.global.v8.f32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <8 x float> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_double() { +; PTX-LABEL: asi_double( +; PTX: { +; PTX-NEXT: .reg .b64 %fd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.f64 [globalout+32], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <4 x double> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_i8( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i8_param_0]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i8_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <32 x i8>, ptr addrspace(1) %in, !invariant.load !0 + store <32 x i8> %load, ptr addrspace(1) %out + ret void +} +define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_i16( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i16_param_0]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i16_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x i16>, ptr addrspace(1) %in, !invariant.load !0 + store <16 x i16> %load, ptr addrspace(1) %out + ret void +} +define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_half( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_half_param_0]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, 
[areg_64_half_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x half>, ptr addrspace(1) %in, !invariant.load !0 + store <16 x half> %load, ptr addrspace(1) %out + ret void +} +define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_bfloat( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_bfloat_param_0]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_bfloat_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x bfloat>, ptr addrspace(1) %in, !invariant.load !0 + store <16 x bfloat> %load, ptr addrspace(1) %out + ret void +} + +define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_i32( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i32_param_0]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i32_param_1]; +; PTX-NEXT: st.global.v8.u32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <8 x i32>, ptr addrspace(1) %in, !invariant.load !0 + store <8 x i32> %load, ptr addrspace(1) %out + ret void +} + +define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_i64( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i64_param_0]; +; PTX-NEXT: ld.global.nc.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd6, [areg_64_i64_param_1]; +; PTX-NEXT: st.global.v4.u64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; PTX-NEXT: ret; + %load = load <4 x i64>, ptr addrspace(1) %in, !invariant.load !0 + store <4 x i64> %load, ptr addrspace(1) %out + ret void +} + +define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_float( +; PTX: { +; PTX-NEXT: .reg .b32 %f<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_float_param_0]; +; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_float_param_1]; +; PTX-NEXT: st.global.v8.f32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ret; + %load = load <8 x float>, ptr addrspace(1) %in, !invariant.load !0 + store <8 x float> %load, ptr addrspace(1) %out + ret void +} + +define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_double( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %fd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_double_param_0]; +; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_double_param_1]; +; PTX-NEXT: st.global.v4.f64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ret; + %load = load <4 x double>, ptr addrspace(1) %in, !invariant.load !0 + store <4 x double> %load, ptr addrspace(1) %out + ret void +} + +define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_i8( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // 
%bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i8_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i8_param_1]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <32 x i8>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <32 x i8> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_i16( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i16_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i16_param_1]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <16 x i16>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <16 x i16> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_half( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_half_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_half_param_1]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <16 x half>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <16 x half> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_bfloat( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_bfloat_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_bfloat_param_1]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <16 x bfloat>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <16 x bfloat> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_i32( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i32_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i32_param_1]; +; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.u32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 
32 + %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <8 x i32> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_i64( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i64_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i64_param_1]; +; PTX-NEXT: ld.global.nc.v4.u64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.u64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <4 x i64> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_float( +; PTX: { +; PTX-NEXT: .reg .b32 %f<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_float_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_float_param_1]; +; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.f32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <8 x float> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_double( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %fd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_double_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_double_param_1]; +; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32]; +; PTX-NEXT: st.global.v4.f64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0 + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <4 x double> %load, ptr addrspace(1) %out.offset + ret void +} + +!0 = !{} diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll new file mode 100644 index 0000000000000..55b71ccfac5a2 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -0,0 +1,543 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + +; In this test, we check that all the addressing modes are lowered correctly, +; addr can be any of the following: +; - avar : direct address +; - asi: direct address + offset +; - areg_64: 64-bit register +; - ari_64: 64-bit register + offset +; Since this is a blackwell+ feature, +; and support for 32-bit addressing does not exist after sm_90, +; the "areg" and "ari" 32-bit addressing modes are not tested or supported. 
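+; For reference, these modes correspond to the following PTX address operands
+; (taken from the checks below, using the v8.b32 loads as an example):
+;   avar    -> ld.global.v8.b32 {...}, [globalin];
+;   asi     -> ld.global.v8.b32 {...}, [globalin+32];
+;   areg_64 -> ld.global.v8.b32 {...}, [%rd1];
+;   ari_64  -> ld.global.v8.b32 {...}, [%rd1+32];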
+ +; Checks 8 types: i8, i16, bfloat, half, i32, i64, float, double + +; Global is the only address space that currently supports 256-bit loads/stores + +@globalin = external addrspace(1) global ptr +@globalout = external addrspace(1) global ptr + +define void @avar_i8() { +; PTX-LABEL: avar_i8( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <32 x i8>, ptr addrspace(1) @globalin + store <32 x i8> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_i16() { +; PTX-LABEL: avar_i16( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x i16>, ptr addrspace(1) @globalin + store <16 x i16> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_half() { +; PTX-LABEL: avar_half( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x half>, ptr addrspace(1) @globalin + store <16 x half> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_bfloat() { +; PTX-LABEL: avar_bfloat( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x bfloat>, ptr addrspace(1) @globalin + store <16 x bfloat> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_i32() { +; PTX-LABEL: avar_i32( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.u32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <8 x i32>, ptr addrspace(1) @globalin + store <8 x i32> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_i64() { +; PTX-LABEL: avar_i64( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.u64 [globalout], {%rd1, %rd2, %rd3, %rd4}; +; PTX-NEXT: ret; + %load = load <4 x i64>, ptr addrspace(1) @globalin + store <4 x i64> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_float() { +; PTX-LABEL: avar_float( +; PTX: { +; PTX-NEXT: .reg .b32 %f<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin]; +; PTX-NEXT: st.global.v8.f32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ret; + %load = load <8 x float>, ptr addrspace(1) @globalin + store <8 x float> %load, ptr addrspace(1) @globalout + ret void +} + +define void @avar_double() { +; PTX-LABEL: avar_double( +; PTX: { +; PTX-NEXT: .reg .b64 %fd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin]; +; PTX-NEXT: st.global.v4.f64 [globalout], 
{%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ret; + %load = load <4 x double>, ptr addrspace(1) @globalin + store <4 x double> %load, ptr addrspace(1) @globalout + ret void +} + +define void @asi_i8() { +; PTX-LABEL: asi_i8( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <32 x i8>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <32 x i8> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_i16() { +; PTX-LABEL: asi_i16( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <16 x i16>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <16 x i16> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_half() { +; PTX-LABEL: asi_half( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <16 x half>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <16 x half> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_bfloat() { +; PTX-LABEL: asi_bfloat( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <16 x bfloat>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <16 x bfloat> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_i32() { +; PTX-LABEL: asi_i32( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.u32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <8 x i32>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <8 x i32> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_i64() { +; PTX-LABEL: asi_i64( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.u64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + 
%load = load <4 x i64>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <4 x i64> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_float() { +; PTX-LABEL: asi_float( +; PTX: { +; PTX-NEXT: .reg .b32 %f<9>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32]; +; PTX-NEXT: st.global.v8.f32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <8 x float>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <8 x float> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @asi_double() { +; PTX-LABEL: asi_double( +; PTX: { +; PTX-NEXT: .reg .b64 %fd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.f64 [globalout+32], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 + %load = load <4 x double>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) @globalout, i32 32 + store <4 x double> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_i8( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i8_param_0]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i8_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <32 x i8>, ptr addrspace(1) %in + store <32 x i8> %load, ptr addrspace(1) %out + ret void +} +define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_i16( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i16_param_0]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i16_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x i16>, ptr addrspace(1) %in + store <16 x i16> %load, ptr addrspace(1) %out + ret void +} +define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_half( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_half_param_0]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_half_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x half>, ptr addrspace(1) %in + store <16 x half> %load, ptr addrspace(1) %out + ret void +} +define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_bfloat( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_bfloat_param_0]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, 
[%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_bfloat_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <16 x bfloat>, ptr addrspace(1) %in + store <16 x bfloat> %load, ptr addrspace(1) %out + ret void +} + +define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_i32( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i32_param_0]; +; PTX-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i32_param_1]; +; PTX-NEXT: st.global.v8.u32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %load = load <8 x i32>, ptr addrspace(1) %in + store <8 x i32> %load, ptr addrspace(1) %out + ret void +} + +define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_i64( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i64_param_0]; +; PTX-NEXT: ld.global.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd6, [areg_64_i64_param_1]; +; PTX-NEXT: st.global.v4.u64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; PTX-NEXT: ret; + %load = load <4 x i64>, ptr addrspace(1) %in + store <4 x i64> %load, ptr addrspace(1) %out + ret void +} + +define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_float( +; PTX: { +; PTX-NEXT: .reg .b32 %f<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_float_param_0]; +; PTX-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_float_param_1]; +; PTX-NEXT: st.global.v8.f32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ret; + %load = load <8 x float>, ptr addrspace(1) %in + store <8 x float> %load, ptr addrspace(1) %out + ret void +} + +define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: areg_64_double( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %fd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [areg_64_double_param_0]; +; PTX-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; PTX-NEXT: ld.param.u64 %rd2, [areg_64_double_param_1]; +; PTX-NEXT: st.global.v4.f64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ret; + %load = load <4 x double>, ptr addrspace(1) %in + store <4 x double> %load, ptr addrspace(1) %out + ret void +} + +define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_i8( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i8_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i8_param_1]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <32 x i8>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <32 x i8> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_i16( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; 
PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i16_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i16_param_1]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <16 x i16>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <16 x i16> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_half( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_half_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_half_param_1]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <16 x half>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <16 x half> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_bfloat( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_bfloat_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_bfloat_param_1]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <16 x bfloat>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <16 x bfloat> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_i32( +; PTX: { +; PTX-NEXT: .reg .b32 %r<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i32_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i32_param_1]; +; PTX-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.u32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <8 x i32>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <8 x i32> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_i64( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<7>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i64_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i64_param_1]; +; PTX-NEXT: ld.global.v4.u64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.u64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <4 x i64>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds 
i8, ptr addrspace(1) %out, i32 32 + store <4 x i64> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_float( +; PTX: { +; PTX-NEXT: .reg .b32 %f<9>; +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_float_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_float_param_1]; +; PTX-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.f32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <8 x float>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <8 x float> %load, ptr addrspace(1) %out.offset + ret void +} + +define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { +; PTX-LABEL: ari_64_double( +; PTX: { +; PTX-NEXT: .reg .b64 %rd<3>; +; PTX-NEXT: .reg .b64 %fd<5>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: +; PTX-NEXT: ld.param.u64 %rd1, [ari_64_double_param_0]; +; PTX-NEXT: ld.param.u64 %rd2, [ari_64_double_param_1]; +; PTX-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32]; +; PTX-NEXT: st.global.v4.f64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ret; + %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 + %load = load <4 x double>, ptr addrspace(1) %in.offset + %out.offset = getelementptr inbounds i8, ptr addrspace(1) %out, i32 32 + store <4 x double> %load, ptr addrspace(1) %out.offset + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll new file mode 100644 index 0000000000000..e26a1a5617d2d --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -0,0 +1,1442 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck -check-prefixes=CHECK,SM90 %s +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100 +; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} + +; This test is based on load-store-vectors.ll, +; and contains testing for lowering 256-bit vector loads/stores + +; Types we are checking: i8, i16, half, bfloat, i32, i64, f32, f64 + +; Address spaces we are checking: generic, global, shared, local +; - Global is the only address space that currently supports 256-bit/v8 loads/stores, +; the other cases will legalize by splitting to smaller vectors + +; 256-bit vector loads/stores are only legal for blackwell+, so on sm_90, the vectors will be split + +; Types of loads/stores we are checking: normal, volatile +; - No need to check atomic loads/stores (monotonic and unordered) like load-store-vectors.ll checks, +; because those only work with non-vectors + +;; generic statespace + +; generic + +define void @generic_32xi8(ptr %a, ptr %b) { +; CHECK-LABEL: generic_32xi8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_32xi8_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 
%rd2, [generic_32xi8_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <32 x i8>, ptr %a + store <32 x i8> %a.load, ptr %b + ret void +} + +define void @generic_16xi16(ptr %a, ptr %b) { +; CHECK-LABEL: generic_16xi16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xi16_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_16xi16_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x i16>, ptr %a + store <16 x i16> %a.load, ptr %b + ret void +} + +define void @generic_16xhalf(ptr %a, ptr %b) { +; CHECK-LABEL: generic_16xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xhalf_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_16xhalf_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x half>, ptr %a + store <16 x half> %a.load, ptr %b + ret void +} + +define void @generic_16xbfloat(ptr %a, ptr %b) { +; CHECK-LABEL: generic_16xbfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xbfloat_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_16xbfloat_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x bfloat>, ptr %a + store <16 x bfloat> %a.load, ptr %b + ret void +} + +define void @generic_8xi32(ptr %a, ptr %b) { +; CHECK-LABEL: generic_8xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi32_param_0]; +; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_8xi32_param_1]; +; CHECK-NEXT: st.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <8 x i32>, ptr %a + store <8 x i32> %a.load, ptr %b + ret void +} + +define void @generic_4xi64(ptr %a, ptr %b) { +; CHECK-LABEL: generic_4xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi64_param_0]; +; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.v2.u64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd6, [generic_4xi64_param_1]; +; CHECK-NEXT: st.v2.u64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ret; + %a.load = load <4 x i64>, ptr %a + store <4 x i64> %a.load, ptr %b + ret void +} + +define void @generic_8xfloat(ptr %a, ptr %b) { +; CHECK-LABEL: generic_8xfloat( +; CHECK: { +; CHECK-NEXT: 
.reg .b32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xfloat_param_0]; +; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_8xfloat_param_1]; +; CHECK-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ret; + %a.load = load <8 x float>, ptr %a + store <8 x float> %a.load, ptr %b + ret void +} + +define void @generic_4xdouble(ptr %a, ptr %b) { +; CHECK-LABEL: generic_4xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xdouble_param_0]; +; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.v2.f64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_4xdouble_param_1]; +; CHECK-NEXT: st.v2.f64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ret; + %a.load = load <4 x double>, ptr %a + store <4 x double> %a.load, ptr %b + ret void +} + +; generic_volatile + +define void @generic_volatile_32xi8(ptr %a, ptr %b) { +; CHECK-LABEL: generic_volatile_32xi8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_32xi8_param_0]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_32xi8_param_1]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <32 x i8>, ptr %a + store volatile <32 x i8> %a.load, ptr %b + ret void +} + +define void @generic_volatile_16xi16(ptr %a, ptr %b) { +; CHECK-LABEL: generic_volatile_16xi16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xi16_param_0]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_16xi16_param_1]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <16 x i16>, ptr %a + store volatile <16 x i16> %a.load, ptr %b + ret void +} + +define void @generic_volatile_16xhalf(ptr %a, ptr %b) { +; CHECK-LABEL: generic_volatile_16xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xhalf_param_0]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_16xhalf_param_1]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <16 x half>, ptr %a + store volatile <16 x half> %a.load, ptr %b + ret void +} + +define void @generic_volatile_16xbfloat(ptr %a, ptr %b) { +; CHECK-LABEL: generic_volatile_16xbfloat( +; CHECK: { +; CHECK-NEXT: 
.reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xbfloat_param_0]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_16xbfloat_param_1]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <16 x bfloat>, ptr %a + store volatile <16 x bfloat> %a.load, ptr %b + ret void +} + +define void @generic_volatile_8xi32(ptr %a, ptr %b) { +; CHECK-LABEL: generic_volatile_8xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi32_param_0]; +; CHECK-NEXT: ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_8xi32_param_1]; +; CHECK-NEXT: st.volatile.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <8 x i32>, ptr %a + store volatile <8 x i32> %a.load, ptr %b + ret void +} + +define void @generic_volatile_4xi64(ptr %a, ptr %b) { +; CHECK-LABEL: generic_volatile_4xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi64_param_0]; +; CHECK-NEXT: ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.volatile.v2.u64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd6, [generic_volatile_4xi64_param_1]; +; CHECK-NEXT: st.volatile.v2.u64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.volatile.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i64>, ptr %a + store volatile <4 x i64> %a.load, ptr %b + ret void +} + +define void @generic_volatile_8xfloat(ptr %a, ptr %b) { +; CHECK-LABEL: generic_volatile_8xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xfloat_param_0]; +; CHECK-NEXT: ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.volatile.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.volatile.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.volatile.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ret; + %a.load = load volatile <8 x float>, ptr %a + store volatile <8 x float> %a.load, ptr %b + ret void +} + +define void @generic_volatile_4xdouble(ptr %a, ptr %b) { +; CHECK-LABEL: generic_volatile_4xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xdouble_param_0]; +; CHECK-NEXT: ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.volatile.v2.f64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_4xdouble_param_1]; +; CHECK-NEXT: st.volatile.v2.f64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.volatile.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x double>, ptr %a + store volatile <4 x double> %a.load, ptr %b + ret void +} + +;; global statespace + +; global + +define void 
@global_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_32xi8( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_32xi8_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_32xi8_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_32xi8( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_32xi8_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_32xi8_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load <32 x i8>, ptr addrspace(1) %a + store <32 x i8> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_16xi16( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_16xi16_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_16xi16_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_16xi16( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_16xi16_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_16xi16_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load <16 x i16>, ptr addrspace(1) %a + store <16 x i16> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_16xhalf( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_16xhalf_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_16xhalf_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_16xhalf( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_16xhalf_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_16xhalf_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load <16 x half>, ptr addrspace(1) %a + store <16 x half> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_16xbfloat(ptr addrspace(1) %a, ptr 
addrspace(1) %b) { +; SM90-LABEL: global_16xbfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_16xbfloat_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_16xbfloat_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_16xbfloat( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_16xbfloat_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_16xbfloat_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load <16 x bfloat>, ptr addrspace(1) %a + store <16 x bfloat> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_8xi32_param_0]; +; SM90-NEXT: ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_8xi32_param_1]; +; SM90-NEXT: st.global.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_8xi32( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_8xi32_param_0]; +; SM100-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_8xi32_param_1]; +; SM100-NEXT: st.global.v8.u32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load <8 x i32>, ptr addrspace(1) %a + store <8 x i32> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_4xi64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_4xi64_param_0]; +; SM90-NEXT: ld.global.v2.u64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.global.v2.u64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd6, [global_4xi64_param_1]; +; SM90-NEXT: st.global.v2.u64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.global.v2.u64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_4xi64( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_4xi64_param_0]; +; SM100-NEXT: ld.global.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd6, [global_4xi64_param_1]; +; SM100-NEXT: st.global.v4.u64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; SM100-NEXT: ret; + %a.load = load <4 x i64>, ptr addrspace(1) %a + store <4 x i64> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %f<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; 
SM90-NEXT: ld.param.u64 %rd1, [global_8xfloat_param_0]; +; SM90-NEXT: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; SM90-NEXT: ld.global.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_8xfloat_param_1]; +; SM90-NEXT: st.global.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; SM90-NEXT: st.global.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b32 %f<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_8xfloat_param_0]; +; SM100-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_8xfloat_param_1]; +; SM100-NEXT: st.global.v8.f32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; SM100-NEXT: ret; + %a.load = load <8 x float>, ptr addrspace(1) %a + store <8 x float> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_4xdouble( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b64 %fd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_4xdouble_param_0]; +; SM90-NEXT: ld.global.v2.f64 {%fd1, %fd2}, [%rd1]; +; SM90-NEXT: ld.global.v2.f64 {%fd3, %fd4}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_4xdouble_param_1]; +; SM90-NEXT: st.global.v2.f64 [%rd2+16], {%fd3, %fd4}; +; SM90-NEXT: st.global.v2.f64 [%rd2], {%fd1, %fd2}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_4xdouble( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %fd<5>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_4xdouble_param_0]; +; SM100-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_4xdouble_param_1]; +; SM100-NEXT: st.global.v4.f64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; +; SM100-NEXT: ret; + %a.load = load <4 x double>, ptr addrspace(1) %a + store <4 x double> %a.load, ptr addrspace(1) %b + ret void +} + +; global_volatile + +define void @global_volatile_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_volatile_32xi8( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_32xi8_param_0]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_32xi8_param_1]; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_volatile_32xi8( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_32xi8_param_0]; +; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_32xi8_param_1]; +; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load volatile <32 x i8>, ptr addrspace(1) %a + store volatile <32 x i8> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_volatile_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_volatile_16xi16( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; 
+; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_16xi16_param_0]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_16xi16_param_1]; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_volatile_16xi16( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_16xi16_param_0]; +; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_16xi16_param_1]; +; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load volatile <16 x i16>, ptr addrspace(1) %a + store volatile <16 x i16> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_volatile_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_volatile_16xhalf( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_16xhalf_param_0]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_16xhalf_param_1]; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_volatile_16xhalf( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_16xhalf_param_0]; +; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_16xhalf_param_1]; +; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load volatile <16 x half>, ptr addrspace(1) %a + store volatile <16 x half> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_volatile_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_volatile_16xbfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_16xbfloat_param_0]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_16xbfloat_param_1]; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_volatile_16xbfloat( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_16xbfloat_param_0]; +; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_16xbfloat_param_1]; +; SM100-NEXT: 
st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load volatile <16 x bfloat>, ptr addrspace(1) %a + store volatile <16 x bfloat> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_volatile_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_volatile_8xi32( +; SM90: { +; SM90-NEXT: .reg .b32 %r<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_8xi32_param_0]; +; SM90-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_8xi32_param_1]; +; SM90-NEXT: st.volatile.global.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.global.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_volatile_8xi32( +; SM100: { +; SM100-NEXT: .reg .b32 %r<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_8xi32_param_0]; +; SM100-NEXT: ld.volatile.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_8xi32_param_1]; +; SM100-NEXT: st.volatile.global.v8.u32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ret; + %a.load = load volatile <8 x i32>, ptr addrspace(1) %a + store volatile <8 x i32> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_volatile_4xi64( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<7>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_4xi64_param_0]; +; SM90-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v2.u64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd6, [global_volatile_4xi64_param_1]; +; SM90-NEXT: st.volatile.global.v2.u64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.volatile.global.v2.u64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_volatile_4xi64( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<7>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_4xi64_param_0]; +; SM100-NEXT: ld.volatile.global.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd6, [global_volatile_4xi64_param_1]; +; SM100-NEXT: st.volatile.global.v4.u64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; SM100-NEXT: ret; + %a.load = load volatile <4 x i64>, ptr addrspace(1) %a + store volatile <4 x i64> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_volatile_8xfloat( +; SM90: { +; SM90-NEXT: .reg .b32 %f<9>; +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_8xfloat_param_0]; +; SM90-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.global.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; SM90-NEXT: st.volatile.global.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_volatile_8xfloat( +; SM100: { +; SM100-NEXT: .reg .b32 %f<9>; +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: 
ld.param.u64 %rd1, [global_volatile_8xfloat_param_0]; +; SM100-NEXT: ld.volatile.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_8xfloat_param_1]; +; SM100-NEXT: st.volatile.global.v8.f32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; SM100-NEXT: ret; + %a.load = load volatile <8 x float>, ptr addrspace(1) %a + store volatile <8 x float> %a.load, ptr addrspace(1) %b + ret void +} + +define void @global_volatile_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) { +; SM90-LABEL: global_volatile_4xdouble( +; SM90: { +; SM90-NEXT: .reg .b64 %rd<3>; +; SM90-NEXT: .reg .b64 %fd<5>; +; SM90-EMPTY: +; SM90-NEXT: // %bb.0: +; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_4xdouble_param_0]; +; SM90-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v2.f64 {%fd3, %fd4}, [%rd1+16]; +; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_4xdouble_param_1]; +; SM90-NEXT: st.volatile.global.v2.f64 [%rd2+16], {%fd3, %fd4}; +; SM90-NEXT: st.volatile.global.v2.f64 [%rd2], {%fd1, %fd2}; +; SM90-NEXT: ret; +; +; SM100-LABEL: global_volatile_4xdouble( +; SM100: { +; SM100-NEXT: .reg .b64 %rd<3>; +; SM100-NEXT: .reg .b64 %fd<5>; +; SM100-EMPTY: +; SM100-NEXT: // %bb.0: +; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_4xdouble_param_0]; +; SM100-NEXT: ld.volatile.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_4xdouble_param_1]; +; SM100-NEXT: st.volatile.global.v4.f64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; +; SM100-NEXT: ret; + %a.load = load volatile <4 x double>, ptr addrspace(1) %a + store volatile <4 x double> %a.load, ptr addrspace(1) %b + ret void +} + +;; shared statespace + +; shared + +define void @shared_32xi8(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_32xi8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_32xi8_param_0]; +; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_32xi8_param_1]; +; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <32 x i8>, ptr addrspace(3) %a + store <32 x i8> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_16xi16(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_16xi16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xi16_param_0]; +; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_16xi16_param_1]; +; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x i16>, ptr addrspace(3) %a + store <16 x i16> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_16xhalf(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_16xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xhalf_param_0]; +; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; 
CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_16xhalf_param_1]; +; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x half>, ptr addrspace(3) %a + store <16 x half> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_16xbfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xbfloat_param_0]; +; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_16xbfloat_param_1]; +; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x bfloat>, ptr addrspace(3) %a + store <16 x bfloat> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_8xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi32_param_0]; +; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.shared.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_8xi32_param_1]; +; CHECK-NEXT: st.shared.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.shared.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <8 x i32>, ptr addrspace(3) %a + store <8 x i32> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_4xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi64_param_0]; +; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.shared.v2.u64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd6, [shared_4xi64_param_1]; +; CHECK-NEXT: st.shared.v2.u64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.shared.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ret; + %a.load = load <4 x i64>, ptr addrspace(3) %a + store <4 x i64> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_8xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xfloat_param_0]; +; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.shared.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_8xfloat_param_1]; +; CHECK-NEXT: st.shared.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.shared.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ret; + %a.load = load <8 x float>, ptr addrspace(3) %a + store <8 x float> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_4xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xdouble_param_0]; +; CHECK-NEXT: ld.shared.v2.f64 
{%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.shared.v2.f64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_4xdouble_param_1]; +; CHECK-NEXT: st.shared.v2.f64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.shared.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ret; + %a.load = load <4 x double>, ptr addrspace(3) %a + store <4 x double> %a.load, ptr addrspace(3) %b + ret void +} + +; shared_volatile + +define void @shared_volatile_32xi8(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_volatile_32xi8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_32xi8_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_32xi8_param_1]; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <32 x i8>, ptr addrspace(3) %a + store volatile <32 x i8> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_volatile_16xi16(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_volatile_16xi16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xi16_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_16xi16_param_1]; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <16 x i16>, ptr addrspace(3) %a + store volatile <16 x i16> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_volatile_16xhalf(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_volatile_16xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xhalf_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_16xhalf_param_1]; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <16 x half>, ptr addrspace(3) %a + store volatile <16 x half> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_volatile_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_volatile_16xbfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xbfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_16xbfloat_param_1]; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; 
+ %a.load = load volatile <16 x bfloat>, ptr addrspace(3) %a + store volatile <16 x bfloat> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_volatile_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_volatile_8xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi32_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_8xi32_param_1]; +; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <8 x i32>, ptr addrspace(3) %a + store volatile <8 x i32> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_volatile_4xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi64_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd6, [shared_volatile_4xi64_param_1]; +; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i64>, ptr addrspace(3) %a + store volatile <4 x i64> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_volatile_8xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ret; + %a.load = load volatile <8 x float>, ptr addrspace(3) %a + store volatile <8 x float> %a.load, ptr addrspace(3) %b + ret void +} + +define void @shared_volatile_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) { +; CHECK-LABEL: shared_volatile_4xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xdouble_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_4xdouble_param_1]; +; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x double>, ptr addrspace(3) %a + store volatile <4 x double> %a.load, ptr addrspace(3) %b + ret void +} + +;; local statespace + +; local + +define void @local_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_32xi8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: 
// %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_32xi8_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_32xi8_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <32 x i8>, ptr addrspace(5) %a + store <32 x i8> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_16xi16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_16xi16_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_16xi16_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x i16>, ptr addrspace(5) %a + store <16 x i16> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_16xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_16xhalf_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_16xhalf_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x half>, ptr addrspace(5) %a + store <16 x half> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_16xbfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_16xbfloat_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_16xbfloat_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <16 x bfloat>, ptr addrspace(5) %a + store <16 x bfloat> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_8xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi32_param_0]; +; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_8xi32_param_1]; +; CHECK-NEXT: st.local.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load <8 x i32>, ptr addrspace(5) %a + store <8 x i32> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_4xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: 
+; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi64_param_0]; +; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.local.v2.u64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd6, [local_4xi64_param_1]; +; CHECK-NEXT: st.local.v2.u64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.local.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ret; + %a.load = load <4 x i64>, ptr addrspace(5) %a + store <4 x i64> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_8xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_8xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_8xfloat_param_1]; +; CHECK-NEXT: st.local.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.local.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ret; + %a.load = load <8 x float>, ptr addrspace(5) %a + store <8 x float> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_4xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_4xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.local.v2.f64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_4xdouble_param_1]; +; CHECK-NEXT: st.local.v2.f64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.local.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ret; + %a.load = load <4 x double>, ptr addrspace(5) %a + store <4 x double> %a.load, ptr addrspace(5) %b + ret void +} + +; local_volatile + +define void @local_volatile_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_volatile_32xi8( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_32xi8_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_32xi8_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <32 x i8>, ptr addrspace(5) %a + store volatile <32 x i8> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_volatile_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_volatile_16xi16( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xi16_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_16xi16_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <16 x i16>, ptr addrspace(5) %a + store volatile <16 x i16> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_volatile_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; 
CHECK-LABEL: local_volatile_16xhalf( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xhalf_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_16xhalf_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <16 x half>, ptr addrspace(5) %a + store volatile <16 x half> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_volatile_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_volatile_16xbfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xbfloat_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_16xbfloat_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <16 x bfloat>, ptr addrspace(5) %a + store volatile <16 x bfloat> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_volatile_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_volatile_8xi32( +; CHECK: { +; CHECK-NEXT: .reg .b32 %r<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi32_param_0]; +; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_8xi32_param_1]; +; CHECK-NEXT: st.local.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ret; + %a.load = load volatile <8 x i32>, ptr addrspace(5) %a + store volatile <8 x i32> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_volatile_4xi64( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<7>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi64_param_0]; +; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.local.v2.u64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd6, [local_volatile_4xi64_param_1]; +; CHECK-NEXT: st.local.v2.u64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.local.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x i64>, ptr addrspace(5) %a + store volatile <4 x i64> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_volatile_8xfloat( +; CHECK: { +; CHECK-NEXT: .reg .b32 %f<9>; +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.local.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.local.v4.f32 [%rd2], 
{%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ret; + %a.load = load volatile <8 x float>, ptr addrspace(5) %a + store volatile <8 x float> %a.load, ptr addrspace(5) %b + ret void +} + +define void @local_volatile_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) { +; CHECK-LABEL: local_volatile_4xdouble( +; CHECK: { +; CHECK-NEXT: .reg .b64 %rd<3>; +; CHECK-NEXT: .reg .b64 %fd<5>; +; CHECK-EMPTY: +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.local.v2.f64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_4xdouble_param_1]; +; CHECK-NEXT: st.local.v2.f64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.local.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ret; + %a.load = load volatile <4 x double>, ptr addrspace(5) %a + store volatile <4 x double> %a.load, ptr addrspace(5) %b + ret void +} diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/256-bit.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/256-bit.ll new file mode 100644 index 0000000000000..9034c96c6a52d --- /dev/null +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/256-bit.ll @@ -0,0 +1,728 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_90 -mattr=+ptx87 -S < %s | FileCheck %s -check-prefixes=CHECK,SM90 +; RUN: opt -mtriple=nvptx64-nvidia-cuda -passes=load-store-vectorizer -mcpu=sm_100 -mattr=+ptx88 -S < %s | FileCheck %s -check-prefixes=CHECK,SM100 + +; 256-bit loads/stores are currently only supported for: +; - global +; - Blackwell (sm_100) +; - ptx 8.8 +; - 32/64-bit types + +; Currently, the LSV produces 256-bit loads/stores if the first three conditions +; are satisfied, as the backend will either upsize or split vectors +; of smaller elements in the Type Legalization stage. + +; In this file, we test i8, i16, i32, i64, f32, f64. +; The other floating-point types are omitted for simplicity. +; We also test the negative case for non-global i32.
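+
+; A minimal illustrative sketch (editorial comment only, not an extra RUN/CHECK
+; test; the pointer names are hypothetical): starting from eight consecutive
+; global i32 loads whose first element is 32-byte aligned, e.g.
+;   %l0 = load i32, ptr addrspace(1) %p0, align 32
+;   ...
+;   %l7 = load i32, ptr addrspace(1) %p7, align 1
+; the SM100 run combines them into one <8 x i32> (256-bit) load, while the
+; SM90 run produces two <4 x i32> (128-bit) loads, as checked in @int32x8 below.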
+ +define void @int8x32(ptr addrspace(1) %ptr) { +; SM90-LABEL: define void @int8x32( +; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; SM90-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 0 +; SM90-NEXT: [[PTR10:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 16 +; SM90-NEXT: [[TMP1:%.*]] = load <16 x i8>, ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[L01:%.*]] = extractelement <16 x i8> [[TMP1]], i32 0 +; SM90-NEXT: [[L110:%.*]] = extractelement <16 x i8> [[TMP1]], i32 1 +; SM90-NEXT: [[L211:%.*]] = extractelement <16 x i8> [[TMP1]], i32 2 +; SM90-NEXT: [[L312:%.*]] = extractelement <16 x i8> [[TMP1]], i32 3 +; SM90-NEXT: [[L413:%.*]] = extractelement <16 x i8> [[TMP1]], i32 4 +; SM90-NEXT: [[L514:%.*]] = extractelement <16 x i8> [[TMP1]], i32 5 +; SM90-NEXT: [[L615:%.*]] = extractelement <16 x i8> [[TMP1]], i32 6 +; SM90-NEXT: [[L716:%.*]] = extractelement <16 x i8> [[TMP1]], i32 7 +; SM90-NEXT: [[L817:%.*]] = extractelement <16 x i8> [[TMP1]], i32 8 +; SM90-NEXT: [[L918:%.*]] = extractelement <16 x i8> [[TMP1]], i32 9 +; SM90-NEXT: [[LA19:%.*]] = extractelement <16 x i8> [[TMP1]], i32 10 +; SM90-NEXT: [[LB20:%.*]] = extractelement <16 x i8> [[TMP1]], i32 11 +; SM90-NEXT: [[LC21:%.*]] = extractelement <16 x i8> [[TMP1]], i32 12 +; SM90-NEXT: [[LD22:%.*]] = extractelement <16 x i8> [[TMP1]], i32 13 +; SM90-NEXT: [[LE23:%.*]] = extractelement <16 x i8> [[TMP1]], i32 14 +; SM90-NEXT: [[LF24:%.*]] = extractelement <16 x i8> [[TMP1]], i32 15 +; SM90-NEXT: [[TMP2:%.*]] = load <16 x i8>, ptr addrspace(1) [[PTR10]], align 16 +; SM90-NEXT: [[L1025:%.*]] = extractelement <16 x i8> [[TMP2]], i32 0 +; SM90-NEXT: [[L1126:%.*]] = extractelement <16 x i8> [[TMP2]], i32 1 +; SM90-NEXT: [[L1227:%.*]] = extractelement <16 x i8> [[TMP2]], i32 2 +; SM90-NEXT: [[L1328:%.*]] = extractelement <16 x i8> [[TMP2]], i32 3 +; SM90-NEXT: [[L1429:%.*]] = extractelement <16 x i8> [[TMP2]], i32 4 +; SM90-NEXT: [[L1530:%.*]] = extractelement <16 x i8> [[TMP2]], i32 5 +; SM90-NEXT: [[L1631:%.*]] = extractelement <16 x i8> [[TMP2]], i32 6 +; SM90-NEXT: [[L1732:%.*]] = extractelement <16 x i8> [[TMP2]], i32 7 +; SM90-NEXT: [[L1833:%.*]] = extractelement <16 x i8> [[TMP2]], i32 8 +; SM90-NEXT: [[L1934:%.*]] = extractelement <16 x i8> [[TMP2]], i32 9 +; SM90-NEXT: [[L1A35:%.*]] = extractelement <16 x i8> [[TMP2]], i32 10 +; SM90-NEXT: [[L1B36:%.*]] = extractelement <16 x i8> [[TMP2]], i32 11 +; SM90-NEXT: [[L1C37:%.*]] = extractelement <16 x i8> [[TMP2]], i32 12 +; SM90-NEXT: [[L1D38:%.*]] = extractelement <16 x i8> [[TMP2]], i32 13 +; SM90-NEXT: [[L1E39:%.*]] = extractelement <16 x i8> [[TMP2]], i32 14 +; SM90-NEXT: [[L1F40:%.*]] = extractelement <16 x i8> [[TMP2]], i32 15 +; SM90-NEXT: [[TMP3:%.*]] = insertelement <16 x i8> poison, i8 [[LB20]], i32 0 +; SM90-NEXT: [[TMP4:%.*]] = insertelement <16 x i8> [[TMP3]], i8 [[LA19]], i32 1 +; SM90-NEXT: [[TMP5:%.*]] = insertelement <16 x i8> [[TMP4]], i8 [[L918]], i32 2 +; SM90-NEXT: [[TMP6:%.*]] = insertelement <16 x i8> [[TMP5]], i8 [[L817]], i32 3 +; SM90-NEXT: [[TMP7:%.*]] = insertelement <16 x i8> [[TMP6]], i8 [[L716]], i32 4 +; SM90-NEXT: [[TMP8:%.*]] = insertelement <16 x i8> [[TMP7]], i8 [[L615]], i32 5 +; SM90-NEXT: [[TMP9:%.*]] = insertelement <16 x i8> [[TMP8]], i8 [[L514]], i32 6 +; SM90-NEXT: [[TMP10:%.*]] = insertelement <16 x i8> [[TMP9]], i8 [[L413]], i32 7 +; SM90-NEXT: [[TMP11:%.*]] = insertelement <16 x i8> [[TMP10]], i8 [[L312]], i32 8 +; SM90-NEXT: [[TMP12:%.*]] = insertelement <16 x i8> [[TMP11]], i8 
[[L211]], i32 9 +; SM90-NEXT: [[TMP13:%.*]] = insertelement <16 x i8> [[TMP12]], i8 [[L110]], i32 10 +; SM90-NEXT: [[TMP14:%.*]] = insertelement <16 x i8> [[TMP13]], i8 [[L01]], i32 11 +; SM90-NEXT: [[TMP15:%.*]] = insertelement <16 x i8> [[TMP14]], i8 [[LF24]], i32 12 +; SM90-NEXT: [[TMP16:%.*]] = insertelement <16 x i8> [[TMP15]], i8 [[LE23]], i32 13 +; SM90-NEXT: [[TMP17:%.*]] = insertelement <16 x i8> [[TMP16]], i8 [[LD22]], i32 14 +; SM90-NEXT: [[TMP18:%.*]] = insertelement <16 x i8> [[TMP17]], i8 [[LC21]], i32 15 +; SM90-NEXT: store <16 x i8> [[TMP18]], ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[TMP19:%.*]] = insertelement <16 x i8> poison, i8 [[L1B36]], i32 0 +; SM90-NEXT: [[TMP20:%.*]] = insertelement <16 x i8> [[TMP19]], i8 [[L1A35]], i32 1 +; SM90-NEXT: [[TMP21:%.*]] = insertelement <16 x i8> [[TMP20]], i8 [[L1934]], i32 2 +; SM90-NEXT: [[TMP22:%.*]] = insertelement <16 x i8> [[TMP21]], i8 [[L1833]], i32 3 +; SM90-NEXT: [[TMP23:%.*]] = insertelement <16 x i8> [[TMP22]], i8 [[L1732]], i32 4 +; SM90-NEXT: [[TMP24:%.*]] = insertelement <16 x i8> [[TMP23]], i8 [[L1631]], i32 5 +; SM90-NEXT: [[TMP25:%.*]] = insertelement <16 x i8> [[TMP24]], i8 [[L1530]], i32 6 +; SM90-NEXT: [[TMP26:%.*]] = insertelement <16 x i8> [[TMP25]], i8 [[L1429]], i32 7 +; SM90-NEXT: [[TMP27:%.*]] = insertelement <16 x i8> [[TMP26]], i8 [[L1328]], i32 8 +; SM90-NEXT: [[TMP28:%.*]] = insertelement <16 x i8> [[TMP27]], i8 [[L1227]], i32 9 +; SM90-NEXT: [[TMP29:%.*]] = insertelement <16 x i8> [[TMP28]], i8 [[L1126]], i32 10 +; SM90-NEXT: [[TMP30:%.*]] = insertelement <16 x i8> [[TMP29]], i8 [[L1025]], i32 11 +; SM90-NEXT: [[TMP31:%.*]] = insertelement <16 x i8> [[TMP30]], i8 [[L1F40]], i32 12 +; SM90-NEXT: [[TMP32:%.*]] = insertelement <16 x i8> [[TMP31]], i8 [[L1E39]], i32 13 +; SM90-NEXT: [[TMP33:%.*]] = insertelement <16 x i8> [[TMP32]], i8 [[L1D38]], i32 14 +; SM90-NEXT: [[TMP34:%.*]] = insertelement <16 x i8> [[TMP33]], i8 [[L1C37]], i32 15 +; SM90-NEXT: store <16 x i8> [[TMP34]], ptr addrspace(1) [[PTR10]], align 16 +; SM90-NEXT: ret void +; +; SM100-LABEL: define void @int8x32( +; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; SM100-NEXT: [[PTR0:%.*]] = getelementptr i8, ptr addrspace(1) [[PTR]], i64 0 +; SM100-NEXT: [[TMP1:%.*]] = load <32 x i8>, ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: [[L01:%.*]] = extractelement <32 x i8> [[TMP1]], i32 0 +; SM100-NEXT: [[L110:%.*]] = extractelement <32 x i8> [[TMP1]], i32 1 +; SM100-NEXT: [[L211:%.*]] = extractelement <32 x i8> [[TMP1]], i32 2 +; SM100-NEXT: [[L312:%.*]] = extractelement <32 x i8> [[TMP1]], i32 3 +; SM100-NEXT: [[L413:%.*]] = extractelement <32 x i8> [[TMP1]], i32 4 +; SM100-NEXT: [[L514:%.*]] = extractelement <32 x i8> [[TMP1]], i32 5 +; SM100-NEXT: [[L615:%.*]] = extractelement <32 x i8> [[TMP1]], i32 6 +; SM100-NEXT: [[L716:%.*]] = extractelement <32 x i8> [[TMP1]], i32 7 +; SM100-NEXT: [[L817:%.*]] = extractelement <32 x i8> [[TMP1]], i32 8 +; SM100-NEXT: [[L918:%.*]] = extractelement <32 x i8> [[TMP1]], i32 9 +; SM100-NEXT: [[LA19:%.*]] = extractelement <32 x i8> [[TMP1]], i32 10 +; SM100-NEXT: [[LB20:%.*]] = extractelement <32 x i8> [[TMP1]], i32 11 +; SM100-NEXT: [[LC21:%.*]] = extractelement <32 x i8> [[TMP1]], i32 12 +; SM100-NEXT: [[LD22:%.*]] = extractelement <32 x i8> [[TMP1]], i32 13 +; SM100-NEXT: [[LE23:%.*]] = extractelement <32 x i8> [[TMP1]], i32 14 +; SM100-NEXT: [[LF24:%.*]] = extractelement <32 x i8> [[TMP1]], i32 15 +; SM100-NEXT: [[L1025:%.*]] = extractelement <32 x i8> [[TMP1]], i32 16 +; 
SM100-NEXT: [[L1126:%.*]] = extractelement <32 x i8> [[TMP1]], i32 17 +; SM100-NEXT: [[L1227:%.*]] = extractelement <32 x i8> [[TMP1]], i32 18 +; SM100-NEXT: [[L1328:%.*]] = extractelement <32 x i8> [[TMP1]], i32 19 +; SM100-NEXT: [[L1429:%.*]] = extractelement <32 x i8> [[TMP1]], i32 20 +; SM100-NEXT: [[L1530:%.*]] = extractelement <32 x i8> [[TMP1]], i32 21 +; SM100-NEXT: [[L1631:%.*]] = extractelement <32 x i8> [[TMP1]], i32 22 +; SM100-NEXT: [[L1732:%.*]] = extractelement <32 x i8> [[TMP1]], i32 23 +; SM100-NEXT: [[L1833:%.*]] = extractelement <32 x i8> [[TMP1]], i32 24 +; SM100-NEXT: [[L1934:%.*]] = extractelement <32 x i8> [[TMP1]], i32 25 +; SM100-NEXT: [[L1A35:%.*]] = extractelement <32 x i8> [[TMP1]], i32 26 +; SM100-NEXT: [[L1B36:%.*]] = extractelement <32 x i8> [[TMP1]], i32 27 +; SM100-NEXT: [[L1C37:%.*]] = extractelement <32 x i8> [[TMP1]], i32 28 +; SM100-NEXT: [[L1D38:%.*]] = extractelement <32 x i8> [[TMP1]], i32 29 +; SM100-NEXT: [[L1E39:%.*]] = extractelement <32 x i8> [[TMP1]], i32 30 +; SM100-NEXT: [[L1F40:%.*]] = extractelement <32 x i8> [[TMP1]], i32 31 +; SM100-NEXT: [[TMP2:%.*]] = insertelement <32 x i8> poison, i8 [[LB20]], i32 0 +; SM100-NEXT: [[TMP3:%.*]] = insertelement <32 x i8> [[TMP2]], i8 [[LA19]], i32 1 +; SM100-NEXT: [[TMP4:%.*]] = insertelement <32 x i8> [[TMP3]], i8 [[L918]], i32 2 +; SM100-NEXT: [[TMP5:%.*]] = insertelement <32 x i8> [[TMP4]], i8 [[L817]], i32 3 +; SM100-NEXT: [[TMP6:%.*]] = insertelement <32 x i8> [[TMP5]], i8 [[L716]], i32 4 +; SM100-NEXT: [[TMP7:%.*]] = insertelement <32 x i8> [[TMP6]], i8 [[L615]], i32 5 +; SM100-NEXT: [[TMP8:%.*]] = insertelement <32 x i8> [[TMP7]], i8 [[L514]], i32 6 +; SM100-NEXT: [[TMP9:%.*]] = insertelement <32 x i8> [[TMP8]], i8 [[L413]], i32 7 +; SM100-NEXT: [[TMP10:%.*]] = insertelement <32 x i8> [[TMP9]], i8 [[L312]], i32 8 +; SM100-NEXT: [[TMP11:%.*]] = insertelement <32 x i8> [[TMP10]], i8 [[L211]], i32 9 +; SM100-NEXT: [[TMP12:%.*]] = insertelement <32 x i8> [[TMP11]], i8 [[L110]], i32 10 +; SM100-NEXT: [[TMP13:%.*]] = insertelement <32 x i8> [[TMP12]], i8 [[L01]], i32 11 +; SM100-NEXT: [[TMP14:%.*]] = insertelement <32 x i8> [[TMP13]], i8 [[LF24]], i32 12 +; SM100-NEXT: [[TMP15:%.*]] = insertelement <32 x i8> [[TMP14]], i8 [[LE23]], i32 13 +; SM100-NEXT: [[TMP16:%.*]] = insertelement <32 x i8> [[TMP15]], i8 [[LD22]], i32 14 +; SM100-NEXT: [[TMP17:%.*]] = insertelement <32 x i8> [[TMP16]], i8 [[LC21]], i32 15 +; SM100-NEXT: [[TMP18:%.*]] = insertelement <32 x i8> [[TMP17]], i8 [[L1B36]], i32 16 +; SM100-NEXT: [[TMP19:%.*]] = insertelement <32 x i8> [[TMP18]], i8 [[L1A35]], i32 17 +; SM100-NEXT: [[TMP20:%.*]] = insertelement <32 x i8> [[TMP19]], i8 [[L1934]], i32 18 +; SM100-NEXT: [[TMP21:%.*]] = insertelement <32 x i8> [[TMP20]], i8 [[L1833]], i32 19 +; SM100-NEXT: [[TMP22:%.*]] = insertelement <32 x i8> [[TMP21]], i8 [[L1732]], i32 20 +; SM100-NEXT: [[TMP23:%.*]] = insertelement <32 x i8> [[TMP22]], i8 [[L1631]], i32 21 +; SM100-NEXT: [[TMP24:%.*]] = insertelement <32 x i8> [[TMP23]], i8 [[L1530]], i32 22 +; SM100-NEXT: [[TMP25:%.*]] = insertelement <32 x i8> [[TMP24]], i8 [[L1429]], i32 23 +; SM100-NEXT: [[TMP26:%.*]] = insertelement <32 x i8> [[TMP25]], i8 [[L1328]], i32 24 +; SM100-NEXT: [[TMP27:%.*]] = insertelement <32 x i8> [[TMP26]], i8 [[L1227]], i32 25 +; SM100-NEXT: [[TMP28:%.*]] = insertelement <32 x i8> [[TMP27]], i8 [[L1126]], i32 26 +; SM100-NEXT: [[TMP29:%.*]] = insertelement <32 x i8> [[TMP28]], i8 [[L1025]], i32 27 +; SM100-NEXT: [[TMP30:%.*]] = insertelement <32 x i8> [[TMP29]], i8 
[[L1F40]], i32 28 +; SM100-NEXT: [[TMP31:%.*]] = insertelement <32 x i8> [[TMP30]], i8 [[L1E39]], i32 29 +; SM100-NEXT: [[TMP32:%.*]] = insertelement <32 x i8> [[TMP31]], i8 [[L1D38]], i32 30 +; SM100-NEXT: [[TMP33:%.*]] = insertelement <32 x i8> [[TMP32]], i8 [[L1C37]], i32 31 +; SM100-NEXT: store <32 x i8> [[TMP33]], ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: ret void +; + + %ptr0 = getelementptr i8, ptr addrspace(1) %ptr, i64 0 + %ptr1 = getelementptr i8, ptr addrspace(1) %ptr, i64 1 + %ptr2 = getelementptr i8, ptr addrspace(1) %ptr, i64 2 + %ptr3 = getelementptr i8, ptr addrspace(1) %ptr, i64 3 + %ptr4 = getelementptr i8, ptr addrspace(1) %ptr, i64 4 + %ptr5 = getelementptr i8, ptr addrspace(1) %ptr, i64 5 + %ptr6 = getelementptr i8, ptr addrspace(1) %ptr, i64 6 + %ptr7 = getelementptr i8, ptr addrspace(1) %ptr, i64 7 + %ptr8 = getelementptr i8, ptr addrspace(1) %ptr, i64 8 + %ptr9 = getelementptr i8, ptr addrspace(1) %ptr, i64 9 + %ptra = getelementptr i8, ptr addrspace(1) %ptr, i64 10 + %ptrb = getelementptr i8, ptr addrspace(1) %ptr, i64 11 + %ptrc = getelementptr i8, ptr addrspace(1) %ptr, i64 12 + %ptrd = getelementptr i8, ptr addrspace(1) %ptr, i64 13 + %ptre = getelementptr i8, ptr addrspace(1) %ptr, i64 14 + %ptrf = getelementptr i8, ptr addrspace(1) %ptr, i64 15 + %ptr10 = getelementptr i8, ptr addrspace(1) %ptr, i64 16 + %ptr11 = getelementptr i8, ptr addrspace(1) %ptr, i64 17 + %ptr12 = getelementptr i8, ptr addrspace(1) %ptr, i64 18 + %ptr13 = getelementptr i8, ptr addrspace(1) %ptr, i64 19 + %ptr14 = getelementptr i8, ptr addrspace(1) %ptr, i64 20 + %ptr15 = getelementptr i8, ptr addrspace(1) %ptr, i64 21 + %ptr16 = getelementptr i8, ptr addrspace(1) %ptr, i64 22 + %ptr17 = getelementptr i8, ptr addrspace(1) %ptr, i64 23 + %ptr18 = getelementptr i8, ptr addrspace(1) %ptr, i64 24 + %ptr19 = getelementptr i8, ptr addrspace(1) %ptr, i64 25 + %ptr1a = getelementptr i8, ptr addrspace(1) %ptr, i64 26 + %ptr1b = getelementptr i8, ptr addrspace(1) %ptr, i64 27 + %ptr1c = getelementptr i8, ptr addrspace(1) %ptr, i64 28 + %ptr1d = getelementptr i8, ptr addrspace(1) %ptr, i64 29 + %ptr1e = getelementptr i8, ptr addrspace(1) %ptr, i64 30 + %ptr1f = getelementptr i8, ptr addrspace(1) %ptr, i64 31 + + %l0 = load i8, ptr addrspace(1) %ptr0, align 32 + %l1 = load i8, ptr addrspace(1) %ptr1, align 1 + %l2 = load i8, ptr addrspace(1) %ptr2, align 1 + %l3 = load i8, ptr addrspace(1) %ptr3, align 1 + %l4 = load i8, ptr addrspace(1) %ptr4, align 1 + %l5 = load i8, ptr addrspace(1) %ptr5, align 1 + %l6 = load i8, ptr addrspace(1) %ptr6, align 1 + %l7 = load i8, ptr addrspace(1) %ptr7, align 1 + %l8 = load i8, ptr addrspace(1) %ptr8, align 1 + %l9 = load i8, ptr addrspace(1) %ptr9, align 1 + %la = load i8, ptr addrspace(1) %ptra, align 1 + %lb = load i8, ptr addrspace(1) %ptrb, align 1 + %lc = load i8, ptr addrspace(1) %ptrc, align 1 + %ld = load i8, ptr addrspace(1) %ptrd, align 1 + %le = load i8, ptr addrspace(1) %ptre, align 1 + %lf = load i8, ptr addrspace(1) %ptrf, align 1 + %l10 = load i8, ptr addrspace(1) %ptr10, align 16 + %l11 = load i8, ptr addrspace(1) %ptr11, align 1 + %l12 = load i8, ptr addrspace(1) %ptr12, align 1 + %l13 = load i8, ptr addrspace(1) %ptr13, align 1 + %l14 = load i8, ptr addrspace(1) %ptr14, align 1 + %l15 = load i8, ptr addrspace(1) %ptr15, align 1 + %l16 = load i8, ptr addrspace(1) %ptr16, align 1 + %l17 = load i8, ptr addrspace(1) %ptr17, align 1 + %l18 = load i8, ptr addrspace(1) %ptr18, align 1 + %l19 = load i8, ptr addrspace(1) %ptr19, align 1 + %l1a = 
load i8, ptr addrspace(1) %ptr1a, align 1 + %l1b = load i8, ptr addrspace(1) %ptr1b, align 1 + %l1c = load i8, ptr addrspace(1) %ptr1c, align 1 + %l1d = load i8, ptr addrspace(1) %ptr1d, align 1 + %l1e = load i8, ptr addrspace(1) %ptr1e, align 1 + %l1f = load i8, ptr addrspace(1) %ptr1f, align 1 + + store i8 %lf, ptr addrspace(1) %ptrc, align 1 + store i8 %le, ptr addrspace(1) %ptrd, align 1 + store i8 %ld, ptr addrspace(1) %ptre, align 1 + store i8 %lc, ptr addrspace(1) %ptrf, align 1 + store i8 %lb, ptr addrspace(1) %ptr0, align 32 + store i8 %la, ptr addrspace(1) %ptr1, align 1 + store i8 %l9, ptr addrspace(1) %ptr2, align 1 + store i8 %l8, ptr addrspace(1) %ptr3, align 1 + store i8 %l7, ptr addrspace(1) %ptr4, align 1 + store i8 %l6, ptr addrspace(1) %ptr5, align 1 + store i8 %l5, ptr addrspace(1) %ptr6, align 1 + store i8 %l4, ptr addrspace(1) %ptr7, align 1 + store i8 %l3, ptr addrspace(1) %ptr8, align 1 + store i8 %l2, ptr addrspace(1) %ptr9, align 1 + store i8 %l1, ptr addrspace(1) %ptra, align 1 + store i8 %l0, ptr addrspace(1) %ptrb, align 1 + store i8 %l1f, ptr addrspace(1) %ptr1c, align 1 + store i8 %l1e, ptr addrspace(1) %ptr1d, align 1 + store i8 %l1d, ptr addrspace(1) %ptr1e, align 1 + store i8 %l1c, ptr addrspace(1) %ptr1f, align 1 + store i8 %l1b, ptr addrspace(1) %ptr10, align 16 + store i8 %l1a, ptr addrspace(1) %ptr11, align 1 + store i8 %l19, ptr addrspace(1) %ptr12, align 1 + store i8 %l18, ptr addrspace(1) %ptr13, align 1 + store i8 %l17, ptr addrspace(1) %ptr14, align 1 + store i8 %l16, ptr addrspace(1) %ptr15, align 1 + store i8 %l15, ptr addrspace(1) %ptr16, align 1 + store i8 %l14, ptr addrspace(1) %ptr17, align 1 + store i8 %l13, ptr addrspace(1) %ptr18, align 1 + store i8 %l12, ptr addrspace(1) %ptr19, align 1 + store i8 %l11, ptr addrspace(1) %ptr1a, align 1 + store i8 %l10, ptr addrspace(1) %ptr1b, align 1 + + ret void +} + +define void @int16x16(ptr addrspace(1) %ptr) { +; SM90-LABEL: define void @int16x16( +; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM90-NEXT: [[PTR0:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 0 +; SM90-NEXT: [[PTR8:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 8 +; SM90-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[L01:%.*]] = extractelement <8 x i16> [[TMP1]], i32 0 +; SM90-NEXT: [[L12:%.*]] = extractelement <8 x i16> [[TMP1]], i32 1 +; SM90-NEXT: [[L23:%.*]] = extractelement <8 x i16> [[TMP1]], i32 2 +; SM90-NEXT: [[L34:%.*]] = extractelement <8 x i16> [[TMP1]], i32 3 +; SM90-NEXT: [[L45:%.*]] = extractelement <8 x i16> [[TMP1]], i32 4 +; SM90-NEXT: [[L56:%.*]] = extractelement <8 x i16> [[TMP1]], i32 5 +; SM90-NEXT: [[L67:%.*]] = extractelement <8 x i16> [[TMP1]], i32 6 +; SM90-NEXT: [[L78:%.*]] = extractelement <8 x i16> [[TMP1]], i32 7 +; SM90-NEXT: [[TMP2:%.*]] = load <8 x i16>, ptr addrspace(1) [[PTR8]], align 16 +; SM90-NEXT: [[L89:%.*]] = extractelement <8 x i16> [[TMP2]], i32 0 +; SM90-NEXT: [[L910:%.*]] = extractelement <8 x i16> [[TMP2]], i32 1 +; SM90-NEXT: [[LA11:%.*]] = extractelement <8 x i16> [[TMP2]], i32 2 +; SM90-NEXT: [[LB12:%.*]] = extractelement <8 x i16> [[TMP2]], i32 3 +; SM90-NEXT: [[LC13:%.*]] = extractelement <8 x i16> [[TMP2]], i32 4 +; SM90-NEXT: [[LD14:%.*]] = extractelement <8 x i16> [[TMP2]], i32 5 +; SM90-NEXT: [[LE15:%.*]] = extractelement <8 x i16> [[TMP2]], i32 6 +; SM90-NEXT: [[LF16:%.*]] = extractelement <8 x i16> [[TMP2]], i32 7 +; SM90-NEXT: [[TMP3:%.*]] = insertelement <8 x i16> poison, i16 [[LB12]], i32 0 +; 
SM90-NEXT: [[TMP4:%.*]] = insertelement <8 x i16> [[TMP3]], i16 [[LA11]], i32 1 +; SM90-NEXT: [[TMP5:%.*]] = insertelement <8 x i16> [[TMP4]], i16 [[L910]], i32 2 +; SM90-NEXT: [[TMP6:%.*]] = insertelement <8 x i16> [[TMP5]], i16 [[L89]], i32 3 +; SM90-NEXT: [[TMP7:%.*]] = insertelement <8 x i16> [[TMP6]], i16 [[L78]], i32 4 +; SM90-NEXT: [[TMP8:%.*]] = insertelement <8 x i16> [[TMP7]], i16 [[L67]], i32 5 +; SM90-NEXT: [[TMP9:%.*]] = insertelement <8 x i16> [[TMP8]], i16 [[L56]], i32 6 +; SM90-NEXT: [[TMP10:%.*]] = insertelement <8 x i16> [[TMP9]], i16 [[L45]], i32 7 +; SM90-NEXT: store <8 x i16> [[TMP10]], ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[TMP11:%.*]] = insertelement <8 x i16> poison, i16 [[L34]], i32 0 +; SM90-NEXT: [[TMP12:%.*]] = insertelement <8 x i16> [[TMP11]], i16 [[L23]], i32 1 +; SM90-NEXT: [[TMP13:%.*]] = insertelement <8 x i16> [[TMP12]], i16 [[L12]], i32 2 +; SM90-NEXT: [[TMP14:%.*]] = insertelement <8 x i16> [[TMP13]], i16 [[L01]], i32 3 +; SM90-NEXT: [[TMP15:%.*]] = insertelement <8 x i16> [[TMP14]], i16 [[LF16]], i32 4 +; SM90-NEXT: [[TMP16:%.*]] = insertelement <8 x i16> [[TMP15]], i16 [[LE15]], i32 5 +; SM90-NEXT: [[TMP17:%.*]] = insertelement <8 x i16> [[TMP16]], i16 [[LD14]], i32 6 +; SM90-NEXT: [[TMP18:%.*]] = insertelement <8 x i16> [[TMP17]], i16 [[LC13]], i32 7 +; SM90-NEXT: store <8 x i16> [[TMP18]], ptr addrspace(1) [[PTR8]], align 16 +; SM90-NEXT: ret void +; +; SM100-LABEL: define void @int16x16( +; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM100-NEXT: [[PTR0:%.*]] = getelementptr i16, ptr addrspace(1) [[PTR]], i64 0 +; SM100-NEXT: [[TMP1:%.*]] = load <16 x i16>, ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: [[L01:%.*]] = extractelement <16 x i16> [[TMP1]], i32 0 +; SM100-NEXT: [[L12:%.*]] = extractelement <16 x i16> [[TMP1]], i32 1 +; SM100-NEXT: [[L23:%.*]] = extractelement <16 x i16> [[TMP1]], i32 2 +; SM100-NEXT: [[L34:%.*]] = extractelement <16 x i16> [[TMP1]], i32 3 +; SM100-NEXT: [[L45:%.*]] = extractelement <16 x i16> [[TMP1]], i32 4 +; SM100-NEXT: [[L56:%.*]] = extractelement <16 x i16> [[TMP1]], i32 5 +; SM100-NEXT: [[L67:%.*]] = extractelement <16 x i16> [[TMP1]], i32 6 +; SM100-NEXT: [[L78:%.*]] = extractelement <16 x i16> [[TMP1]], i32 7 +; SM100-NEXT: [[L89:%.*]] = extractelement <16 x i16> [[TMP1]], i32 8 +; SM100-NEXT: [[L910:%.*]] = extractelement <16 x i16> [[TMP1]], i32 9 +; SM100-NEXT: [[LA11:%.*]] = extractelement <16 x i16> [[TMP1]], i32 10 +; SM100-NEXT: [[LB12:%.*]] = extractelement <16 x i16> [[TMP1]], i32 11 +; SM100-NEXT: [[LC13:%.*]] = extractelement <16 x i16> [[TMP1]], i32 12 +; SM100-NEXT: [[LD14:%.*]] = extractelement <16 x i16> [[TMP1]], i32 13 +; SM100-NEXT: [[LE15:%.*]] = extractelement <16 x i16> [[TMP1]], i32 14 +; SM100-NEXT: [[LF16:%.*]] = extractelement <16 x i16> [[TMP1]], i32 15 +; SM100-NEXT: [[TMP2:%.*]] = insertelement <16 x i16> poison, i16 [[LB12]], i32 0 +; SM100-NEXT: [[TMP3:%.*]] = insertelement <16 x i16> [[TMP2]], i16 [[LA11]], i32 1 +; SM100-NEXT: [[TMP4:%.*]] = insertelement <16 x i16> [[TMP3]], i16 [[L910]], i32 2 +; SM100-NEXT: [[TMP5:%.*]] = insertelement <16 x i16> [[TMP4]], i16 [[L89]], i32 3 +; SM100-NEXT: [[TMP6:%.*]] = insertelement <16 x i16> [[TMP5]], i16 [[L78]], i32 4 +; SM100-NEXT: [[TMP7:%.*]] = insertelement <16 x i16> [[TMP6]], i16 [[L67]], i32 5 +; SM100-NEXT: [[TMP8:%.*]] = insertelement <16 x i16> [[TMP7]], i16 [[L56]], i32 6 +; SM100-NEXT: [[TMP9:%.*]] = insertelement <16 x i16> [[TMP8]], i16 [[L45]], i32 7 +; SM100-NEXT: [[TMP10:%.*]] = insertelement 
<16 x i16> [[TMP9]], i16 [[L34]], i32 8 +; SM100-NEXT: [[TMP11:%.*]] = insertelement <16 x i16> [[TMP10]], i16 [[L23]], i32 9 +; SM100-NEXT: [[TMP12:%.*]] = insertelement <16 x i16> [[TMP11]], i16 [[L12]], i32 10 +; SM100-NEXT: [[TMP13:%.*]] = insertelement <16 x i16> [[TMP12]], i16 [[L01]], i32 11 +; SM100-NEXT: [[TMP14:%.*]] = insertelement <16 x i16> [[TMP13]], i16 [[LF16]], i32 12 +; SM100-NEXT: [[TMP15:%.*]] = insertelement <16 x i16> [[TMP14]], i16 [[LE15]], i32 13 +; SM100-NEXT: [[TMP16:%.*]] = insertelement <16 x i16> [[TMP15]], i16 [[LD14]], i32 14 +; SM100-NEXT: [[TMP17:%.*]] = insertelement <16 x i16> [[TMP16]], i16 [[LC13]], i32 15 +; SM100-NEXT: store <16 x i16> [[TMP17]], ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: ret void +; + + %ptr0 = getelementptr i16, ptr addrspace(1) %ptr, i64 0 + %ptr1 = getelementptr i16, ptr addrspace(1) %ptr, i64 1 + %ptr2 = getelementptr i16, ptr addrspace(1) %ptr, i64 2 + %ptr3 = getelementptr i16, ptr addrspace(1) %ptr, i64 3 + %ptr4 = getelementptr i16, ptr addrspace(1) %ptr, i64 4 + %ptr5 = getelementptr i16, ptr addrspace(1) %ptr, i64 5 + %ptr6 = getelementptr i16, ptr addrspace(1) %ptr, i64 6 + %ptr7 = getelementptr i16, ptr addrspace(1) %ptr, i64 7 + %ptr8 = getelementptr i16, ptr addrspace(1) %ptr, i64 8 + %ptr9 = getelementptr i16, ptr addrspace(1) %ptr, i64 9 + %ptra = getelementptr i16, ptr addrspace(1) %ptr, i64 10 + %ptrb = getelementptr i16, ptr addrspace(1) %ptr, i64 11 + %ptrc = getelementptr i16, ptr addrspace(1) %ptr, i64 12 + %ptrd = getelementptr i16, ptr addrspace(1) %ptr, i64 13 + %ptre = getelementptr i16, ptr addrspace(1) %ptr, i64 14 + %ptrf = getelementptr i16, ptr addrspace(1) %ptr, i64 15 + + %l0 = load i16, ptr addrspace(1) %ptr0, align 32 + %l1 = load i16, ptr addrspace(1) %ptr1, align 1 + %l2 = load i16, ptr addrspace(1) %ptr2, align 1 + %l3 = load i16, ptr addrspace(1) %ptr3, align 1 + %l4 = load i16, ptr addrspace(1) %ptr4, align 1 + %l5 = load i16, ptr addrspace(1) %ptr5, align 1 + %l6 = load i16, ptr addrspace(1) %ptr6, align 1 + %l7 = load i16, ptr addrspace(1) %ptr7, align 1 + %l8 = load i16, ptr addrspace(1) %ptr8, align 16 + %l9 = load i16, ptr addrspace(1) %ptr9, align 1 + %la = load i16, ptr addrspace(1) %ptra, align 1 + %lb = load i16, ptr addrspace(1) %ptrb, align 1 + %lc = load i16, ptr addrspace(1) %ptrc, align 1 + %ld = load i16, ptr addrspace(1) %ptrd, align 1 + %le = load i16, ptr addrspace(1) %ptre, align 1 + %lf = load i16, ptr addrspace(1) %ptrf, align 1 + + store i16 %lf, ptr addrspace(1) %ptrc, align 1 + store i16 %le, ptr addrspace(1) %ptrd, align 1 + store i16 %ld, ptr addrspace(1) %ptre, align 1 + store i16 %lc, ptr addrspace(1) %ptrf, align 1 + store i16 %lb, ptr addrspace(1) %ptr0, align 32 + store i16 %la, ptr addrspace(1) %ptr1, align 1 + store i16 %l9, ptr addrspace(1) %ptr2, align 1 + store i16 %l8, ptr addrspace(1) %ptr3, align 1 + store i16 %l7, ptr addrspace(1) %ptr4, align 1 + store i16 %l6, ptr addrspace(1) %ptr5, align 1 + store i16 %l5, ptr addrspace(1) %ptr6, align 1 + store i16 %l4, ptr addrspace(1) %ptr7, align 1 + store i16 %l3, ptr addrspace(1) %ptr8, align 16 + store i16 %l2, ptr addrspace(1) %ptr9, align 1 + store i16 %l1, ptr addrspace(1) %ptra, align 1 + store i16 %l0, ptr addrspace(1) %ptrb, align 1 + + ret void +} + +define void @int32x8(ptr addrspace(1) %ptr) { +; SM90-LABEL: define void @int32x8( +; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM90-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 0 +; SM90-NEXT: [[PTR4:%.*]] = 
getelementptr i32, ptr addrspace(1) [[PTR]], i64 4 +; SM90-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[L01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; SM90-NEXT: [[L12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; SM90-NEXT: [[L23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; SM90-NEXT: [[L34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; SM90-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr addrspace(1) [[PTR4]], align 16 +; SM90-NEXT: [[L45:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; SM90-NEXT: [[L56:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; SM90-NEXT: [[L67:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; SM90-NEXT: [[L78:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; SM90-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[L78]], i32 0 +; SM90-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[L67]], i32 1 +; SM90-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[L56]], i32 2 +; SM90-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[L45]], i32 3 +; SM90-NEXT: store <4 x i32> [[TMP6]], ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[L34]], i32 0 +; SM90-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[L23]], i32 1 +; SM90-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[L12]], i32 2 +; SM90-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[L01]], i32 3 +; SM90-NEXT: store <4 x i32> [[TMP10]], ptr addrspace(1) [[PTR4]], align 16 +; SM90-NEXT: ret void +; +; SM100-LABEL: define void @int32x8( +; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM100-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr addrspace(1) [[PTR]], i64 0 +; SM100-NEXT: [[TMP1:%.*]] = load <8 x i32>, ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: [[L01:%.*]] = extractelement <8 x i32> [[TMP1]], i32 0 +; SM100-NEXT: [[L12:%.*]] = extractelement <8 x i32> [[TMP1]], i32 1 +; SM100-NEXT: [[L23:%.*]] = extractelement <8 x i32> [[TMP1]], i32 2 +; SM100-NEXT: [[L34:%.*]] = extractelement <8 x i32> [[TMP1]], i32 3 +; SM100-NEXT: [[L45:%.*]] = extractelement <8 x i32> [[TMP1]], i32 4 +; SM100-NEXT: [[L56:%.*]] = extractelement <8 x i32> [[TMP1]], i32 5 +; SM100-NEXT: [[L67:%.*]] = extractelement <8 x i32> [[TMP1]], i32 6 +; SM100-NEXT: [[L78:%.*]] = extractelement <8 x i32> [[TMP1]], i32 7 +; SM100-NEXT: [[TMP2:%.*]] = insertelement <8 x i32> poison, i32 [[L78]], i32 0 +; SM100-NEXT: [[TMP3:%.*]] = insertelement <8 x i32> [[TMP2]], i32 [[L67]], i32 1 +; SM100-NEXT: [[TMP4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[L56]], i32 2 +; SM100-NEXT: [[TMP5:%.*]] = insertelement <8 x i32> [[TMP4]], i32 [[L45]], i32 3 +; SM100-NEXT: [[TMP6:%.*]] = insertelement <8 x i32> [[TMP5]], i32 [[L34]], i32 4 +; SM100-NEXT: [[TMP7:%.*]] = insertelement <8 x i32> [[TMP6]], i32 [[L23]], i32 5 +; SM100-NEXT: [[TMP8:%.*]] = insertelement <8 x i32> [[TMP7]], i32 [[L12]], i32 6 +; SM100-NEXT: [[TMP9:%.*]] = insertelement <8 x i32> [[TMP8]], i32 [[L01]], i32 7 +; SM100-NEXT: store <8 x i32> [[TMP9]], ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: ret void +; + + %ptr0 = getelementptr i32, ptr addrspace(1) %ptr, i64 0 + %ptr1 = getelementptr i32, ptr addrspace(1) %ptr, i64 1 + %ptr2 = getelementptr i32, ptr addrspace(1) %ptr, i64 2 + %ptr3 = getelementptr i32, ptr addrspace(1) %ptr, i64 3 + %ptr4 = getelementptr i32, ptr addrspace(1) %ptr, i64 4 + %ptr5 = getelementptr i32, ptr addrspace(1) %ptr, i64 5 + %ptr6 = 
getelementptr i32, ptr addrspace(1) %ptr, i64 6 + %ptr7 = getelementptr i32, ptr addrspace(1) %ptr, i64 7 + + %l0 = load i32, ptr addrspace(1) %ptr0, align 32 + %l1 = load i32, ptr addrspace(1) %ptr1, align 1 + %l2 = load i32, ptr addrspace(1) %ptr2, align 1 + %l3 = load i32, ptr addrspace(1) %ptr3, align 1 + %l4 = load i32, ptr addrspace(1) %ptr4, align 16 + %l5 = load i32, ptr addrspace(1) %ptr5, align 1 + %l6 = load i32, ptr addrspace(1) %ptr6, align 1 + %l7 = load i32, ptr addrspace(1) %ptr7, align 1 + + store i32 %l7, ptr addrspace(1) %ptr0, align 32 + store i32 %l6, ptr addrspace(1) %ptr1, align 1 + store i32 %l5, ptr addrspace(1) %ptr2, align 1 + store i32 %l4, ptr addrspace(1) %ptr3, align 1 + store i32 %l3, ptr addrspace(1) %ptr4, align 16 + store i32 %l2, ptr addrspace(1) %ptr5, align 1 + store i32 %l1, ptr addrspace(1) %ptr6, align 1 + store i32 %l0, ptr addrspace(1) %ptr7, align 1 + + ret void +} + +define void @int64x4(ptr addrspace(1) %ptr) { +; SM90-LABEL: define void @int64x4( +; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM90-NEXT: [[PTR0:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 0 +; SM90-NEXT: [[PTR2:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 2 +; SM90-NEXT: [[TMP1:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[L01:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0 +; SM90-NEXT: [[L12:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1 +; SM90-NEXT: [[TMP2:%.*]] = load <2 x i64>, ptr addrspace(1) [[PTR2]], align 16 +; SM90-NEXT: [[L23:%.*]] = extractelement <2 x i64> [[TMP2]], i32 0 +; SM90-NEXT: [[L34:%.*]] = extractelement <2 x i64> [[TMP2]], i32 1 +; SM90-NEXT: [[TMP3:%.*]] = insertelement <2 x i64> poison, i64 [[L34]], i32 0 +; SM90-NEXT: [[TMP4:%.*]] = insertelement <2 x i64> [[TMP3]], i64 [[L23]], i32 1 +; SM90-NEXT: store <2 x i64> [[TMP4]], ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[TMP5:%.*]] = insertelement <2 x i64> poison, i64 [[L12]], i32 0 +; SM90-NEXT: [[TMP6:%.*]] = insertelement <2 x i64> [[TMP5]], i64 [[L01]], i32 1 +; SM90-NEXT: store <2 x i64> [[TMP6]], ptr addrspace(1) [[PTR2]], align 16 +; SM90-NEXT: ret void +; +; SM100-LABEL: define void @int64x4( +; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM100-NEXT: [[PTR0:%.*]] = getelementptr i64, ptr addrspace(1) [[PTR]], i64 0 +; SM100-NEXT: [[TMP1:%.*]] = load <4 x i64>, ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: [[L01:%.*]] = extractelement <4 x i64> [[TMP1]], i32 0 +; SM100-NEXT: [[L12:%.*]] = extractelement <4 x i64> [[TMP1]], i32 1 +; SM100-NEXT: [[L23:%.*]] = extractelement <4 x i64> [[TMP1]], i32 2 +; SM100-NEXT: [[L34:%.*]] = extractelement <4 x i64> [[TMP1]], i32 3 +; SM100-NEXT: [[TMP2:%.*]] = insertelement <4 x i64> poison, i64 [[L34]], i32 0 +; SM100-NEXT: [[TMP3:%.*]] = insertelement <4 x i64> [[TMP2]], i64 [[L23]], i32 1 +; SM100-NEXT: [[TMP4:%.*]] = insertelement <4 x i64> [[TMP3]], i64 [[L12]], i32 2 +; SM100-NEXT: [[TMP5:%.*]] = insertelement <4 x i64> [[TMP4]], i64 [[L01]], i32 3 +; SM100-NEXT: store <4 x i64> [[TMP5]], ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: ret void +; + + %ptr0 = getelementptr i64, ptr addrspace(1) %ptr, i64 0 + %ptr1 = getelementptr i64, ptr addrspace(1) %ptr, i64 1 + %ptr2 = getelementptr i64, ptr addrspace(1) %ptr, i64 2 + %ptr3 = getelementptr i64, ptr addrspace(1) %ptr, i64 3 + + %l0 = load i64, ptr addrspace(1) %ptr0, align 32 + %l1 = load i64, ptr addrspace(1) %ptr1, align 1 + %l2 = load i64, ptr addrspace(1) %ptr2, align 16 + %l3 = load i64, ptr 
addrspace(1) %ptr3, align 1 + + store i64 %l3, ptr addrspace(1) %ptr0, align 32 + store i64 %l2, ptr addrspace(1) %ptr1, align 1 + store i64 %l1, ptr addrspace(1) %ptr2, align 16 + store i64 %l0, ptr addrspace(1) %ptr3, align 1 + + ret void +} + +define void @float32x8(ptr addrspace(1) %ptr) { +; SM90-LABEL: define void @float32x8( +; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM90-NEXT: [[PTR0:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 0 +; SM90-NEXT: [[PTR4:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 4 +; SM90-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[L01:%.*]] = extractelement <4 x float> [[TMP1]], i32 0 +; SM90-NEXT: [[L12:%.*]] = extractelement <4 x float> [[TMP1]], i32 1 +; SM90-NEXT: [[L23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2 +; SM90-NEXT: [[L34:%.*]] = extractelement <4 x float> [[TMP1]], i32 3 +; SM90-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr addrspace(1) [[PTR4]], align 16 +; SM90-NEXT: [[L45:%.*]] = extractelement <4 x float> [[TMP2]], i32 0 +; SM90-NEXT: [[L56:%.*]] = extractelement <4 x float> [[TMP2]], i32 1 +; SM90-NEXT: [[L67:%.*]] = extractelement <4 x float> [[TMP2]], i32 2 +; SM90-NEXT: [[L78:%.*]] = extractelement <4 x float> [[TMP2]], i32 3 +; SM90-NEXT: [[TMP3:%.*]] = insertelement <4 x float> poison, float [[L78]], i32 0 +; SM90-NEXT: [[TMP4:%.*]] = insertelement <4 x float> [[TMP3]], float [[L67]], i32 1 +; SM90-NEXT: [[TMP5:%.*]] = insertelement <4 x float> [[TMP4]], float [[L56]], i32 2 +; SM90-NEXT: [[TMP6:%.*]] = insertelement <4 x float> [[TMP5]], float [[L45]], i32 3 +; SM90-NEXT: store <4 x float> [[TMP6]], ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[TMP7:%.*]] = insertelement <4 x float> poison, float [[L34]], i32 0 +; SM90-NEXT: [[TMP8:%.*]] = insertelement <4 x float> [[TMP7]], float [[L23]], i32 1 +; SM90-NEXT: [[TMP9:%.*]] = insertelement <4 x float> [[TMP8]], float [[L12]], i32 2 +; SM90-NEXT: [[TMP10:%.*]] = insertelement <4 x float> [[TMP9]], float [[L01]], i32 3 +; SM90-NEXT: store <4 x float> [[TMP10]], ptr addrspace(1) [[PTR4]], align 16 +; SM90-NEXT: ret void +; +; SM100-LABEL: define void @float32x8( +; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM100-NEXT: [[PTR0:%.*]] = getelementptr float, ptr addrspace(1) [[PTR]], i64 0 +; SM100-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: [[L01:%.*]] = extractelement <8 x float> [[TMP1]], i32 0 +; SM100-NEXT: [[L12:%.*]] = extractelement <8 x float> [[TMP1]], i32 1 +; SM100-NEXT: [[L23:%.*]] = extractelement <8 x float> [[TMP1]], i32 2 +; SM100-NEXT: [[L34:%.*]] = extractelement <8 x float> [[TMP1]], i32 3 +; SM100-NEXT: [[L45:%.*]] = extractelement <8 x float> [[TMP1]], i32 4 +; SM100-NEXT: [[L56:%.*]] = extractelement <8 x float> [[TMP1]], i32 5 +; SM100-NEXT: [[L67:%.*]] = extractelement <8 x float> [[TMP1]], i32 6 +; SM100-NEXT: [[L78:%.*]] = extractelement <8 x float> [[TMP1]], i32 7 +; SM100-NEXT: [[TMP2:%.*]] = insertelement <8 x float> poison, float [[L78]], i32 0 +; SM100-NEXT: [[TMP3:%.*]] = insertelement <8 x float> [[TMP2]], float [[L67]], i32 1 +; SM100-NEXT: [[TMP4:%.*]] = insertelement <8 x float> [[TMP3]], float [[L56]], i32 2 +; SM100-NEXT: [[TMP5:%.*]] = insertelement <8 x float> [[TMP4]], float [[L45]], i32 3 +; SM100-NEXT: [[TMP6:%.*]] = insertelement <8 x float> [[TMP5]], float [[L34]], i32 4 +; SM100-NEXT: [[TMP7:%.*]] = insertelement <8 x float> [[TMP6]], float [[L23]], i32 5 +; SM100-NEXT: [[TMP8:%.*]] = 
insertelement <8 x float> [[TMP7]], float [[L12]], i32 6 +; SM100-NEXT: [[TMP9:%.*]] = insertelement <8 x float> [[TMP8]], float [[L01]], i32 7 +; SM100-NEXT: store <8 x float> [[TMP9]], ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: ret void +; + + %ptr0 = getelementptr float, ptr addrspace(1) %ptr, i64 0 + %ptr1 = getelementptr float, ptr addrspace(1) %ptr, i64 1 + %ptr2 = getelementptr float, ptr addrspace(1) %ptr, i64 2 + %ptr3 = getelementptr float, ptr addrspace(1) %ptr, i64 3 + %ptr4 = getelementptr float, ptr addrspace(1) %ptr, i64 4 + %ptr5 = getelementptr float, ptr addrspace(1) %ptr, i64 5 + %ptr6 = getelementptr float, ptr addrspace(1) %ptr, i64 6 + %ptr7 = getelementptr float, ptr addrspace(1) %ptr, i64 7 + + %l0 = load float, ptr addrspace(1) %ptr0, align 32 + %l1 = load float, ptr addrspace(1) %ptr1, align 1 + %l2 = load float, ptr addrspace(1) %ptr2, align 1 + %l3 = load float, ptr addrspace(1) %ptr3, align 1 + %l4 = load float, ptr addrspace(1) %ptr4, align 16 + %l5 = load float, ptr addrspace(1) %ptr5, align 1 + %l6 = load float, ptr addrspace(1) %ptr6, align 1 + %l7 = load float, ptr addrspace(1) %ptr7, align 1 + + store float %l7, ptr addrspace(1) %ptr0, align 32 + store float %l6, ptr addrspace(1) %ptr1, align 1 + store float %l5, ptr addrspace(1) %ptr2, align 1 + store float %l4, ptr addrspace(1) %ptr3, align 1 + store float %l3, ptr addrspace(1) %ptr4, align 16 + store float %l2, ptr addrspace(1) %ptr5, align 1 + store float %l1, ptr addrspace(1) %ptr6, align 1 + store float %l0, ptr addrspace(1) %ptr7, align 1 + + ret void +} + +define void @float64x4(ptr addrspace(1) %ptr) { +; SM90-LABEL: define void @float64x4( +; SM90-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM90-NEXT: [[PTR0:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 0 +; SM90-NEXT: [[PTR2:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 2 +; SM90-NEXT: [[TMP1:%.*]] = load <2 x double>, ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[L01:%.*]] = extractelement <2 x double> [[TMP1]], i32 0 +; SM90-NEXT: [[L12:%.*]] = extractelement <2 x double> [[TMP1]], i32 1 +; SM90-NEXT: [[TMP2:%.*]] = load <2 x double>, ptr addrspace(1) [[PTR2]], align 16 +; SM90-NEXT: [[L23:%.*]] = extractelement <2 x double> [[TMP2]], i32 0 +; SM90-NEXT: [[L34:%.*]] = extractelement <2 x double> [[TMP2]], i32 1 +; SM90-NEXT: [[TMP3:%.*]] = insertelement <2 x double> poison, double [[L34]], i32 0 +; SM90-NEXT: [[TMP4:%.*]] = insertelement <2 x double> [[TMP3]], double [[L23]], i32 1 +; SM90-NEXT: store <2 x double> [[TMP4]], ptr addrspace(1) [[PTR0]], align 32 +; SM90-NEXT: [[TMP5:%.*]] = insertelement <2 x double> poison, double [[L12]], i32 0 +; SM90-NEXT: [[TMP6:%.*]] = insertelement <2 x double> [[TMP5]], double [[L01]], i32 1 +; SM90-NEXT: store <2 x double> [[TMP6]], ptr addrspace(1) [[PTR2]], align 16 +; SM90-NEXT: ret void +; +; SM100-LABEL: define void @float64x4( +; SM100-SAME: ptr addrspace(1) [[PTR:%.*]]) #[[ATTR0]] { +; SM100-NEXT: [[PTR0:%.*]] = getelementptr double, ptr addrspace(1) [[PTR]], i64 0 +; SM100-NEXT: [[TMP1:%.*]] = load <4 x double>, ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: [[L01:%.*]] = extractelement <4 x double> [[TMP1]], i32 0 +; SM100-NEXT: [[L12:%.*]] = extractelement <4 x double> [[TMP1]], i32 1 +; SM100-NEXT: [[L23:%.*]] = extractelement <4 x double> [[TMP1]], i32 2 +; SM100-NEXT: [[L34:%.*]] = extractelement <4 x double> [[TMP1]], i32 3 +; SM100-NEXT: [[TMP2:%.*]] = insertelement <4 x double> poison, double [[L34]], i32 0 +; SM100-NEXT: [[TMP3:%.*]] 
= insertelement <4 x double> [[TMP2]], double [[L23]], i32 1 +; SM100-NEXT: [[TMP4:%.*]] = insertelement <4 x double> [[TMP3]], double [[L12]], i32 2 +; SM100-NEXT: [[TMP5:%.*]] = insertelement <4 x double> [[TMP4]], double [[L01]], i32 3 +; SM100-NEXT: store <4 x double> [[TMP5]], ptr addrspace(1) [[PTR0]], align 32 +; SM100-NEXT: ret void +; + + %ptr0 = getelementptr double, ptr addrspace(1) %ptr, i64 0 + %ptr1 = getelementptr double, ptr addrspace(1) %ptr, i64 1 + %ptr2 = getelementptr double, ptr addrspace(1) %ptr, i64 2 + %ptr3 = getelementptr double, ptr addrspace(1) %ptr, i64 3 + + %l0 = load double, ptr addrspace(1) %ptr0, align 32 + %l1 = load double, ptr addrspace(1) %ptr1, align 1 + %l2 = load double, ptr addrspace(1) %ptr2, align 16 + %l3 = load double, ptr addrspace(1) %ptr3, align 1 + + store double %l3, ptr addrspace(1) %ptr0, align 32 + store double %l2, ptr addrspace(1) %ptr1, align 1 + store double %l1, ptr addrspace(1) %ptr2, align 16 + store double %l0, ptr addrspace(1) %ptr3, align 1 + + ret void +} + +define void @int32x8_non_global(ptr %ptr) { +; CHECK-LABEL: define void @int32x8_non_global( +; CHECK-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[PTR0:%.*]] = getelementptr i32, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[PTR4:%.*]] = getelementptr i32, ptr [[PTR]], i64 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[PTR0]], align 32 +; CHECK-NEXT: [[L01:%.*]] = extractelement <4 x i32> [[TMP1]], i32 0 +; CHECK-NEXT: [[L12:%.*]] = extractelement <4 x i32> [[TMP1]], i32 1 +; CHECK-NEXT: [[L23:%.*]] = extractelement <4 x i32> [[TMP1]], i32 2 +; CHECK-NEXT: [[L34:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3 +; CHECK-NEXT: [[TMP2:%.*]] = load <4 x i32>, ptr [[PTR4]], align 16 +; CHECK-NEXT: [[L45:%.*]] = extractelement <4 x i32> [[TMP2]], i32 0 +; CHECK-NEXT: [[L56:%.*]] = extractelement <4 x i32> [[TMP2]], i32 1 +; CHECK-NEXT: [[L67:%.*]] = extractelement <4 x i32> [[TMP2]], i32 2 +; CHECK-NEXT: [[L78:%.*]] = extractelement <4 x i32> [[TMP2]], i32 3 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i32> poison, i32 [[L78]], i32 0 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i32> [[TMP3]], i32 [[L67]], i32 1 +; CHECK-NEXT: [[TMP5:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[L56]], i32 2 +; CHECK-NEXT: [[TMP6:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[L45]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP6]], ptr [[PTR0]], align 32 +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <4 x i32> poison, i32 [[L34]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[L23]], i32 1 +; CHECK-NEXT: [[TMP9:%.*]] = insertelement <4 x i32> [[TMP8]], i32 [[L12]], i32 2 +; CHECK-NEXT: [[TMP10:%.*]] = insertelement <4 x i32> [[TMP9]], i32 [[L01]], i32 3 +; CHECK-NEXT: store <4 x i32> [[TMP10]], ptr [[PTR4]], align 16 +; CHECK-NEXT: ret void + + %ptr0 = getelementptr i32, ptr %ptr, i64 0 + %ptr1 = getelementptr i32, ptr %ptr, i64 1 + %ptr2 = getelementptr i32, ptr %ptr, i64 2 + %ptr3 = getelementptr i32, ptr %ptr, i64 3 + %ptr4 = getelementptr i32, ptr %ptr, i64 4 + %ptr5 = getelementptr i32, ptr %ptr, i64 5 + %ptr6 = getelementptr i32, ptr %ptr, i64 6 + %ptr7 = getelementptr i32, ptr %ptr, i64 7 + + %l0 = load i32, ptr %ptr0, align 32 + %l1 = load i32, ptr %ptr1, align 1 + %l2 = load i32, ptr %ptr2, align 1 + %l3 = load i32, ptr %ptr3, align 1 + %l4 = load i32, ptr %ptr4, align 16 + %l5 = load i32, ptr %ptr5, align 1 + %l6 = load i32, ptr %ptr6, align 1 + %l7 = load i32, ptr %ptr7, align 1 + + store i32 %l7, ptr %ptr0, align 32 + store i32 %l6, ptr 
%ptr1, align 1 + store i32 %l5, ptr %ptr2, align 1 + store i32 %l4, ptr %ptr3, align 1 + store i32 %l3, ptr %ptr4, align 16 + store i32 %l2, ptr %ptr5, align 1 + store i32 %l1, ptr %ptr6, align 1 + store i32 %l0, ptr %ptr7, align 1 + + ret void +} From 293fb1f5e4236fb5075130313ff2b290fe8fb437 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Fri, 9 May 2025 17:19:11 +0000 Subject: [PATCH 02/11] Clang format --- llvm/lib/Target/NVPTX/NVPTX.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 2468b8f43ae94..41f3832545ed9 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -196,12 +196,7 @@ enum FromType { Float, Untyped }; -enum VecType { - Scalar = 1, - V2 = 2, - V4 = 4, - V8 = 8 -}; +enum VecType { Scalar = 1, V2 = 2, V4 = 4, V8 = 8 }; } // namespace PTXLdStInstCode /// PTXCvtMode - Conversion code enumeration From 852c7b7988ccd123492643cd8e9ac96798f4ac59 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Fri, 9 May 2025 22:35:51 +0000 Subject: [PATCH 03/11] Address reviewer feedback --- .../NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp | 14 - llvm/lib/Target/NVPTX/NVPTX.h | 1 - llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 136 +++------ llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 37 +-- llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 46 ++- llvm/lib/Target/NVPTX/NVPTXSubtarget.h | 6 +- .../Target/NVPTX/NVPTXTargetTransformInfo.cpp | 2 +- llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll | 4 +- .../load-store-256-addressing-invariant.ll | 2 +- .../NVPTX/load-store-256-addressing.ll | 2 +- .../CodeGen/NVPTX/load-store-vectors-256.ll | 4 +- .../{256-bit.ll => load-store-256-bit.ll} | 268 +++++++++--------- 12 files changed, 218 insertions(+), 304 deletions(-) rename llvm/test/Transforms/LoadStoreVectorizer/NVPTX/{256-bit.ll => load-store-256-bit.ll} (85%) diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp index ab1c3c19168af..b4616b64bad15 100644 --- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp +++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXInstPrinter.cpp @@ -311,20 +311,6 @@ void NVPTXInstPrinter::printLdStCode(const MCInst *MI, int OpNum, default: llvm_unreachable("Unknown register type"); } - } else if (Modifier == "vec") { - switch (Imm) { - case NVPTX::PTXLdStInstCode::V2: - O << ".v2"; - return; - case NVPTX::PTXLdStInstCode::V4: - O << ".v4"; - return; - case NVPTX::PTXLdStInstCode::V8: - O << ".v8"; - return; - } - // TODO: evaluate whether cases not covered by this switch are bugs - return; } llvm_unreachable(formatv("Unknown Modifier: {}", Modifier).str().c_str()); } diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index 41f3832545ed9..e7901d87f9b86 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -196,7 +196,6 @@ enum FromType { Float, Untyped }; -enum VecType { Scalar = 1, V2 = 2, V4 = 4, V8 = 8 }; } // namespace PTXLdStInstCode /// PTXCvtMode - Conversion code enumeration diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 12d0cdb1d486c..0280b1a442e09 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1014,11 +1014,11 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { // Helper function template to reduce amount of boilerplate code for // opcode selection. 
-static std::optional<unsigned> -pickOpcodeForVT(MVT::SimpleValueType VT, unsigned Opcode_i8, - unsigned Opcode_i16, unsigned Opcode_i32, - std::optional<unsigned> Opcode_i64, unsigned Opcode_f32, - std::optional<unsigned> Opcode_f64) { +static std::optional<unsigned> pickOpcodeForVT( + MVT::SimpleValueType VT, std::optional<unsigned> Opcode_i8, + std::optional<unsigned> Opcode_i16, std::optional<unsigned> Opcode_i32, + std::optional<unsigned> Opcode_i64, std::optional<unsigned> Opcode_f32, + std::optional<unsigned> Opcode_f64) { switch (VT) { case MVT::i1: case MVT::i8: @@ -1093,7 +1093,6 @@ bool NVPTXDAGToDAGISel::tryLoad(SDNode *N) { SDValue Ops[] = {getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, @@ -1130,6 +1129,22 @@ static bool isSubVectorPackedInI32(EVT EltVT) { return Isv2x16VT(EltVT) || EltVT == MVT::v4i8; } +static unsigned getLoadStoreVectorNumElts(SDNode *N) { + switch (N->getOpcode()) { + case NVPTXISD::LoadV2: + case NVPTXISD::StoreV2: + return 2; + case NVPTXISD::LoadV4: + case NVPTXISD::StoreV4: + return 4; + case NVPTXISD::LoadV8: + case NVPTXISD::StoreV8: + return 8; + default: + llvm_unreachable("Unexpected opcode"); + } +} + bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { MemSDNode *MemSD = cast<MemSDNode>(N); const EVT MemEVT = MemSD->getMemoryVT(); @@ -1161,26 +1176,7 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { ? NVPTX::PTXLdStInstCode::Signed : NVPTX::PTXLdStInstCode::Untyped; - unsigned VecType; - unsigned FromTypeWidth; - switch (N->getOpcode()) { - case NVPTXISD::LoadV2: - FromTypeWidth = TotalWidth / 2; - VecType = NVPTX::PTXLdStInstCode::V2; - break; - case NVPTXISD::LoadV4: - FromTypeWidth = TotalWidth / 4; - VecType = NVPTX::PTXLdStInstCode::V4; - break; - case NVPTXISD::LoadV8: - if (!Subtarget->has256BitMaskedLoadStore()) - return false; - FromTypeWidth = TotalWidth / 8; - VecType = NVPTX::PTXLdStInstCode::V8; - break; - default: - return false; - } + unsigned FromTypeWidth = TotalWidth / getLoadStoreVectorNumElts(N); if (isSubVectorPackedInI32(EltVT)) { assert(ExtensionType == ISD::NON_EXTLOAD); @@ -1195,7 +1191,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { SDValue Ops[] = {getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(VecType, DL), getI32Imm(FromType, DL), getI32Imm(FromTypeWidth, DL), Base, @@ -1219,16 +1214,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { NVPTX::LDV_f32_v4, NVPTX::LDV_f64_v4); break; case NVPTXISD::LoadV8: - switch (EltVT.getSimpleVT().SimpleTy) { - case MVT::i32: - Opcode = NVPTX::LDV_i32_v8; - break; - case MVT::f32: - Opcode = NVPTX::LDV_f32_v8; - break; - default: - return false; - } + Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, std::nullopt, + std::nullopt, NVPTX::LDV_i32_v8, std::nullopt, + NVPTX::LDV_f32_v8, std::nullopt); break; } if (!Opcode) @@ -1334,22 +1322,10 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { std::nullopt, NVPTX::INT_PTX_LDU_G_v4f32_ELE, std::nullopt); break; case NVPTXISD::LoadV8: - switch (EltVT.getSimpleVT().SimpleTy) { - case MVT::i32: - Opcode = NVPTX::INT_PTX_LDG_G_v8i32_ELE; - break; - case MVT::f32: - Opcode = NVPTX::INT_PTX_LDG_G_v8f32_ELE; - break; - case MVT::v2i16: - case MVT::v2f16: - case MVT::v2bf16: - case MVT::v4i8: - Opcode = NVPTX::INT_PTX_LDG_G_v8i32_ELE; - break; - default: - return false; - } + Opcode = pickOpcodeForVT( + EltVT.getSimpleVT().SimpleTy, std::nullopt, std::nullopt, + NVPTX::INT_PTX_LDG_G_v8i32_ELE, std::nullopt, NVPTX::INT_PTX_LDG_G_v8f32_ELE, + std::nullopt);
break; } if (!Opcode) @@ -1435,7 +1411,6 @@ bool NVPTXDAGToDAGISel::tryStore(SDNode *N) { getI32Imm(Ordering, DL), getI32Imm(Scope, DL), getI32Imm(CodeAddrSpace, DL), - getI32Imm(NVPTX::PTXLdStInstCode::Scalar, DL), getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL), getI32Imm(ToTypeWidth, DL), Base, @@ -1483,38 +1458,12 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { // - for integer type, always use 'u' const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits(); - SmallVector Ops; - SDValue N2; - unsigned VecType; - unsigned ToTypeWidth; - - switch (N->getOpcode()) { - case NVPTXISD::StoreV2: - VecType = NVPTX::PTXLdStInstCode::V2; - Ops.append({N->getOperand(1), N->getOperand(2)}); - N2 = N->getOperand(3); - ToTypeWidth = TotalWidth / 2; - break; - case NVPTXISD::StoreV4: - VecType = NVPTX::PTXLdStInstCode::V4; - Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3), - N->getOperand(4)}); - N2 = N->getOperand(5); - ToTypeWidth = TotalWidth / 4; - break; - case NVPTXISD::StoreV8: - if (!Subtarget->has256BitMaskedLoadStore()) - return false; - VecType = NVPTX::PTXLdStInstCode::V8; - Ops.append({N->getOperand(1), N->getOperand(2), N->getOperand(3), - N->getOperand(4), N->getOperand(5), N->getOperand(6), - N->getOperand(7), N->getOperand(8)}); - N2 = N->getOperand(9); - ToTypeWidth = TotalWidth / 8; - break; - default: - return false; - } + unsigned NumElts = getLoadStoreVectorNumElts(N); + SmallVector Ops; + for (unsigned I : llvm::seq(NumElts)) + Ops.append({N->getOperand(I + 1)}); + SDValue N2 = N->getOperand(NumElts + 1); + unsigned ToTypeWidth = TotalWidth / NumElts; if (isSubVectorPackedInI32(EltVT)) { EltVT = MVT::i32; @@ -1527,7 +1476,7 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { SelectADDR(N2, Base, Offset); Ops.append({getI32Imm(Ordering, DL), getI32Imm(Scope, DL), - getI32Imm(CodeAddrSpace, DL), getI32Imm(VecType, DL), + getI32Imm(CodeAddrSpace, DL), getI32Imm(NVPTX::PTXLdStInstCode::Untyped, DL), getI32Imm(ToTypeWidth, DL), Base, Offset, Chain}); @@ -1548,16 +1497,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { NVPTX::STV_f32_v4, NVPTX::STV_f64_v4); break; case NVPTXISD::StoreV8: - switch (EltVT.getSimpleVT().SimpleTy) { - case MVT::i32: - Opcode = NVPTX::STV_i32_v8; - break; - case MVT::f32: - Opcode = NVPTX::STV_f32_v8; - break; - default: - return false; - } + Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, std::nullopt, + std::nullopt, NVPTX::STV_i32_v8, std::nullopt, + NVPTX::STV_f32_v8, std::nullopt); break; } diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp index d7883b5d526aa..75c6a2121370f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -3229,12 +3229,8 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const { if (ValVT != MemVT) return SDValue(); - // 256-bit vectors are only allowed iff the address is global - // and the target supports 256-bit loads/stores - unsigned AddrSpace = cast(N)->getAddressSpace(); - bool CanLowerTo256Bit = - AddrSpace == ADDRESS_SPACE_GLOBAL && STI.has256BitMaskedLoadStore(); - const auto NumEltsAndEltVT = getVectorLoweringShape(ValVT, CanLowerTo256Bit); + const auto NumEltsAndEltVT = getVectorLoweringShape( + ValVT, STI.has256BitVectorLoadStore(N->getAddressSpace())); if (!NumEltsAndEltVT) return SDValue(); const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); @@ -5802,7 +5798,7 @@ static void ReplaceBITCAST(SDNode *Node, SelectionDAG &DAG, /// 
ReplaceVectorLoad - Convert vector loads into multi-output scalar loads. static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, SmallVectorImpl<SDValue> &Results, - bool TargetHas256BitVectorLoadStore) { + const NVPTXSubtarget &STI) { LoadSDNode *LD = cast<LoadSDNode>(N); const EVT ResVT = LD->getValueType(0); const EVT MemVT = LD->getMemoryVT(); @@ -5812,12 +5808,8 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, if (ResVT != MemVT) return; - // 256-bit vectors are only allowed iff the address is global - // and the target supports 256-bit loads/stores - unsigned AddrSpace = cast(N)->getAddressSpace(); - bool CanLowerTo256Bit = - AddrSpace == ADDRESS_SPACE_GLOBAL && TargetHas256BitVectorLoadStore; - const auto NumEltsAndEltVT = getVectorLoweringShape(ResVT, CanLowerTo256Bit); + const auto NumEltsAndEltVT = getVectorLoweringShape( + ResVT, STI.has256BitVectorLoadStore(LD->getAddressSpace())); if (!NumEltsAndEltVT) return; const auto [NumElts, EltVT] = NumEltsAndEltVT.value(); @@ -5840,28 +5832,23 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG, const MVT LoadEltVT = (EltVT.getSizeInBits() < 16) ? MVT::i16 : EltVT; unsigned Opcode; - SDVTList LdResVTs; switch (NumElts) { default: return; case 2: Opcode = NVPTXISD::LoadV2; - LdResVTs = DAG.getVTList(LoadEltVT, LoadEltVT, MVT::Other); break; - case 4: { + case 4: Opcode = NVPTXISD::LoadV4; - LdResVTs = - DAG.getVTList({LoadEltVT, LoadEltVT, LoadEltVT, LoadEltVT, MVT::Other}); break; - } - case 8: { + case 8: Opcode = NVPTXISD::LoadV8; - EVT ListVTs[] = {LoadEltVT, LoadEltVT, LoadEltVT, LoadEltVT, LoadEltVT, - LoadEltVT, LoadEltVT, LoadEltVT, MVT::Other}; - LdResVTs = DAG.getVTList(ListVTs); break; } - } + auto ListVTs = SmallVector(NumElts, LoadEltVT); + ListVTs.push_back(MVT::Other); + SDVTList LdResVTs = DAG.getVTList(ListVTs); + SDLoc DL(LD); // Copy regular operands @@ -6133,7 +6120,7 @@ void NVPTXTargetLowering::ReplaceNodeResults( ReplaceBITCAST(N, DAG, Results); return; case ISD::LOAD: - ReplaceLoadVector(N, DAG, Results, STI.has256BitMaskedLoadStore()); + ReplaceLoadVector(N, DAG, Results, STI); return; case ISD::INTRINSIC_W_CHAIN: ReplaceINTRINSIC_W_CHAIN(N, DAG, Results); diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index 1d2074b804f89..118f7d660a333 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -2391,9 +2391,9 @@ foreach vt = [v2f16, v2bf16, v2i16, v4i8] in { class LD<NVPTXRegClass regclass> : NVPTXInst< (outs regclass:$dst), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), - "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$fromWidth " "\t$dst, [$addr];", []>; let mayLoad=1, hasSideEffects=0 in { @@ -2409,8 +2409,8 @@ class ST<NVPTXRegClass regclass> : NVPTXInst< (outs), (ins regclass:$src, LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, - LdStCode:$Vec, LdStCode:$Sign, i32imm:$toWidth, ADDR:$addr), - "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$toWidth" + LdStCode:$Sign, i32imm:$toWidth, ADDR:$addr), + "st${sem:sem}${scope:scope}${addsp:addsp}.${Sign:sign}$toWidth" " \t[$addr], $src;", []>; let mayStore=1, hasSideEffects=0 in { @@ -2428,33 +2428,32 @@ multiclass LD_VEC { def _v2 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2), - (ins LdStCode:$sem, LdStCode:$scope,
LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), - "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2}}, [$addr];", []>; def _v4 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), - "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4}}, [$addr];", []>; - if support_v8 then { + if support_v8 then def _v8 : NVPTXInst< (outs regclass:$dst1, regclass:$dst2, regclass:$dst3, regclass:$dst4, regclass:$dst5, regclass:$dst6, regclass:$dst7, regclass:$dst8), - (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + (ins LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), - "ld${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "ld${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth " "\t{{$dst1, $dst2, $dst3, $dst4, $dst5, $dst6, $dst7, $dst8}}, " "[$addr];", []>; - } } let mayLoad=1, hasSideEffects=0 in { defm LDV_i8 : LD_VEC; defm LDV_i16 : LD_VEC; - defm LDV_i32 : LD_VEC; + defm LDV_i32 : LD_VEC; defm LDV_i64 : LD_VEC; - defm LDV_f32 : LD_VEC; + defm LDV_f32 : LD_VEC; defm LDV_f64 : LD_VEC; } @@ -2462,36 +2461,35 @@ multiclass ST_VEC { def _v2 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, LdStCode:$sem, LdStCode:$scope, - LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, i32imm:$fromWidth, + LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), - "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${scope:scope}${addsp:addsp}.v2.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2}};", []>; def _v4 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), - "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${scope:scope}${addsp:addsp}.v4.${Sign:sign}$fromWidth " "\t[$addr], {{$src1, $src2, $src3, $src4}};", []>; - if support_v8 then { + if support_v8 then def _v8 : NVPTXInst< (outs), (ins regclass:$src1, regclass:$src2, regclass:$src3, regclass:$src4, regclass:$src5, regclass:$src6, regclass:$src7, regclass:$src8, - LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Vec, LdStCode:$Sign, + LdStCode:$sem, LdStCode:$scope, LdStCode:$addsp, LdStCode:$Sign, i32imm:$fromWidth, ADDR:$addr), - "st${sem:sem}${scope:scope}${addsp:addsp}${Vec:vec}.${Sign:sign}$fromWidth " + "st${sem:sem}${scope:scope}${addsp:addsp}.v8.${Sign:sign}$fromWidth " "\t[$addr], " "{{$src1, $src2, $src3, $src4, $src5, $src6, $src7, $src8}};", []>; - } } let mayStore=1, hasSideEffects=0 in { defm STV_i8 : ST_VEC; defm STV_i16 : ST_VEC; - defm STV_i32 : ST_VEC; + defm STV_i32 : ST_VEC; defm STV_i64 : ST_VEC; - defm STV_f32 : ST_VEC; + defm STV_f32 : ST_VEC; defm STV_f64 : ST_VEC; } diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h index 
5552bba728160..5136b1ee28502 100644 --- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.h +++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.h @@ -20,6 +20,7 @@ #include "NVPTXRegisterInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" +#include "llvm/Support/NVPTXAddrSpace.h" #include #define GET_SUBTARGETINFO_HEADER @@ -72,8 +73,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo { const SelectionDAGTargetInfo *getSelectionDAGInfo() const override; - bool has256BitMaskedLoadStore() const { - return SmVersion >= 100 && PTXVersion >= 88; + bool has256BitVectorLoadStore(unsigned AS) const { + return SmVersion >= 100 && PTXVersion >= 88 && + AS == NVPTXAS::ADDRESS_SPACE_GLOBAL; } bool hasAtomAddF64() const { return SmVersion >= 60; } bool hasAtomScope() const { return SmVersion >= 60; } diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp index b1484111f1d2d..3ae2d9d5181a3 100644 --- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp @@ -593,7 +593,7 @@ Value *NVPTXTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, unsigned NVPTXTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const { // 256 bit loads/stores are currently only supported for global address space - if (AddrSpace == ADDRESS_SPACE_GLOBAL && ST->has256BitMaskedLoadStore()) + if (ST->has256BitVectorLoadStore(AddrSpace)) return 256; return 128; } diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll index f4abcb37aa894..8381b3b7dbea4 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 -verify-machineinstrs | FileCheck %s -check-prefixes=SM90 -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 -verify-machineinstrs | FileCheck %s -check-prefixes=SM100 -; RUN: %if ptxas %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.9 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; For 256-bit vectors, check that invariant loads from the ; global addrspace are lowered to ld.global.nc. 
diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index a0bfbef53020f..0d051848c0d06 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; In this test, we check that all the addressing modes are lowered correctly ; for 256-bit invariant loads, which get lowered to ld.global.nc diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll index 55b71ccfac5a2..4543f75dfa1eb 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=PTX -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; In this test, we check that all the addressing modes are lowered correctly, ; addr can be any of the following: diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index e26a1a5617d2d..2d19c308e4f3a 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | FileCheck -check-prefixes=CHECK,SM90 %s -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} +; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx87 | %ptxas-verify -arch=sm_90 %} ; RUN: llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | FileCheck %s -check-prefixes=CHECK,SM100 -; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} +; RUN: %if ptxas-12.9 %{ llc < %s -march=nvptx64 -mcpu=sm_100 -mattr=+ptx88 | %ptxas-verify -arch=sm_100 %} ; This test is based on load-store-vectors.ll, ; and contains testing for lowering 256-bit vector loads/stores diff --git a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/256-bit.ll b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/load-store-256-bit.ll similarity index 85% rename from llvm/test/Transforms/LoadStoreVectorizer/NVPTX/256-bit.ll rename to llvm/test/Transforms/LoadStoreVectorizer/NVPTX/load-store-256-bit.ll index 9034c96c6a52d..649347d455c67 100644 --- a/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/256-bit.ll +++ b/llvm/test/Transforms/LoadStoreVectorizer/NVPTX/load-store-256-bit.ll @@ -197,70 +197,70 @@ define void @int8x32(ptr addrspace(1) %ptr) { %ptr1f = getelementptr i8, ptr addrspace(1) %ptr, i64 31 %l0 = load i8, ptr addrspace(1) %ptr0, align 32 - %l1 = load i8, ptr addrspace(1) %ptr1, align 1 - %l2 = load i8, 
ptr addrspace(1) %ptr2, align 1 - %l3 = load i8, ptr addrspace(1) %ptr3, align 1 - %l4 = load i8, ptr addrspace(1) %ptr4, align 1 - %l5 = load i8, ptr addrspace(1) %ptr5, align 1 - %l6 = load i8, ptr addrspace(1) %ptr6, align 1 - %l7 = load i8, ptr addrspace(1) %ptr7, align 1 - %l8 = load i8, ptr addrspace(1) %ptr8, align 1 - %l9 = load i8, ptr addrspace(1) %ptr9, align 1 - %la = load i8, ptr addrspace(1) %ptra, align 1 - %lb = load i8, ptr addrspace(1) %ptrb, align 1 - %lc = load i8, ptr addrspace(1) %ptrc, align 1 - %ld = load i8, ptr addrspace(1) %ptrd, align 1 - %le = load i8, ptr addrspace(1) %ptre, align 1 - %lf = load i8, ptr addrspace(1) %ptrf, align 1 + %l1 = load i8, ptr addrspace(1) %ptr1 + %l2 = load i8, ptr addrspace(1) %ptr2 + %l3 = load i8, ptr addrspace(1) %ptr3 + %l4 = load i8, ptr addrspace(1) %ptr4 + %l5 = load i8, ptr addrspace(1) %ptr5 + %l6 = load i8, ptr addrspace(1) %ptr6 + %l7 = load i8, ptr addrspace(1) %ptr7 + %l8 = load i8, ptr addrspace(1) %ptr8 + %l9 = load i8, ptr addrspace(1) %ptr9 + %la = load i8, ptr addrspace(1) %ptra + %lb = load i8, ptr addrspace(1) %ptrb + %lc = load i8, ptr addrspace(1) %ptrc + %ld = load i8, ptr addrspace(1) %ptrd + %le = load i8, ptr addrspace(1) %ptre + %lf = load i8, ptr addrspace(1) %ptrf %l10 = load i8, ptr addrspace(1) %ptr10, align 16 - %l11 = load i8, ptr addrspace(1) %ptr11, align 1 - %l12 = load i8, ptr addrspace(1) %ptr12, align 1 - %l13 = load i8, ptr addrspace(1) %ptr13, align 1 - %l14 = load i8, ptr addrspace(1) %ptr14, align 1 - %l15 = load i8, ptr addrspace(1) %ptr15, align 1 - %l16 = load i8, ptr addrspace(1) %ptr16, align 1 - %l17 = load i8, ptr addrspace(1) %ptr17, align 1 - %l18 = load i8, ptr addrspace(1) %ptr18, align 1 - %l19 = load i8, ptr addrspace(1) %ptr19, align 1 - %l1a = load i8, ptr addrspace(1) %ptr1a, align 1 - %l1b = load i8, ptr addrspace(1) %ptr1b, align 1 - %l1c = load i8, ptr addrspace(1) %ptr1c, align 1 - %l1d = load i8, ptr addrspace(1) %ptr1d, align 1 - %l1e = load i8, ptr addrspace(1) %ptr1e, align 1 - %l1f = load i8, ptr addrspace(1) %ptr1f, align 1 - - store i8 %lf, ptr addrspace(1) %ptrc, align 1 - store i8 %le, ptr addrspace(1) %ptrd, align 1 - store i8 %ld, ptr addrspace(1) %ptre, align 1 - store i8 %lc, ptr addrspace(1) %ptrf, align 1 + %l11 = load i8, ptr addrspace(1) %ptr11 + %l12 = load i8, ptr addrspace(1) %ptr12 + %l13 = load i8, ptr addrspace(1) %ptr13 + %l14 = load i8, ptr addrspace(1) %ptr14 + %l15 = load i8, ptr addrspace(1) %ptr15 + %l16 = load i8, ptr addrspace(1) %ptr16 + %l17 = load i8, ptr addrspace(1) %ptr17 + %l18 = load i8, ptr addrspace(1) %ptr18 + %l19 = load i8, ptr addrspace(1) %ptr19 + %l1a = load i8, ptr addrspace(1) %ptr1a + %l1b = load i8, ptr addrspace(1) %ptr1b + %l1c = load i8, ptr addrspace(1) %ptr1c + %l1d = load i8, ptr addrspace(1) %ptr1d + %l1e = load i8, ptr addrspace(1) %ptr1e + %l1f = load i8, ptr addrspace(1) %ptr1f + + store i8 %lf, ptr addrspace(1) %ptrc + store i8 %le, ptr addrspace(1) %ptrd + store i8 %ld, ptr addrspace(1) %ptre + store i8 %lc, ptr addrspace(1) %ptrf store i8 %lb, ptr addrspace(1) %ptr0, align 32 - store i8 %la, ptr addrspace(1) %ptr1, align 1 - store i8 %l9, ptr addrspace(1) %ptr2, align 1 - store i8 %l8, ptr addrspace(1) %ptr3, align 1 - store i8 %l7, ptr addrspace(1) %ptr4, align 1 - store i8 %l6, ptr addrspace(1) %ptr5, align 1 - store i8 %l5, ptr addrspace(1) %ptr6, align 1 - store i8 %l4, ptr addrspace(1) %ptr7, align 1 - store i8 %l3, ptr addrspace(1) %ptr8, align 1 - store i8 %l2, ptr addrspace(1) %ptr9, align 1 - store 
i8 %l1, ptr addrspace(1) %ptra, align 1 - store i8 %l0, ptr addrspace(1) %ptrb, align 1 - store i8 %l1f, ptr addrspace(1) %ptr1c, align 1 - store i8 %l1e, ptr addrspace(1) %ptr1d, align 1 - store i8 %l1d, ptr addrspace(1) %ptr1e, align 1 - store i8 %l1c, ptr addrspace(1) %ptr1f, align 1 + store i8 %la, ptr addrspace(1) %ptr1 + store i8 %l9, ptr addrspace(1) %ptr2 + store i8 %l8, ptr addrspace(1) %ptr3 + store i8 %l7, ptr addrspace(1) %ptr4 + store i8 %l6, ptr addrspace(1) %ptr5 + store i8 %l5, ptr addrspace(1) %ptr6 + store i8 %l4, ptr addrspace(1) %ptr7 + store i8 %l3, ptr addrspace(1) %ptr8 + store i8 %l2, ptr addrspace(1) %ptr9 + store i8 %l1, ptr addrspace(1) %ptra + store i8 %l0, ptr addrspace(1) %ptrb + store i8 %l1f, ptr addrspace(1) %ptr1c + store i8 %l1e, ptr addrspace(1) %ptr1d + store i8 %l1d, ptr addrspace(1) %ptr1e + store i8 %l1c, ptr addrspace(1) %ptr1f store i8 %l1b, ptr addrspace(1) %ptr10, align 16 - store i8 %l1a, ptr addrspace(1) %ptr11, align 1 - store i8 %l19, ptr addrspace(1) %ptr12, align 1 - store i8 %l18, ptr addrspace(1) %ptr13, align 1 - store i8 %l17, ptr addrspace(1) %ptr14, align 1 - store i8 %l16, ptr addrspace(1) %ptr15, align 1 - store i8 %l15, ptr addrspace(1) %ptr16, align 1 - store i8 %l14, ptr addrspace(1) %ptr17, align 1 - store i8 %l13, ptr addrspace(1) %ptr18, align 1 - store i8 %l12, ptr addrspace(1) %ptr19, align 1 - store i8 %l11, ptr addrspace(1) %ptr1a, align 1 - store i8 %l10, ptr addrspace(1) %ptr1b, align 1 + store i8 %l1a, ptr addrspace(1) %ptr11 + store i8 %l19, ptr addrspace(1) %ptr12 + store i8 %l18, ptr addrspace(1) %ptr13 + store i8 %l17, ptr addrspace(1) %ptr14 + store i8 %l16, ptr addrspace(1) %ptr15 + store i8 %l15, ptr addrspace(1) %ptr16 + store i8 %l14, ptr addrspace(1) %ptr17 + store i8 %l13, ptr addrspace(1) %ptr18 + store i8 %l12, ptr addrspace(1) %ptr19 + store i8 %l11, ptr addrspace(1) %ptr1a + store i8 %l10, ptr addrspace(1) %ptr1b ret void } @@ -366,38 +366,38 @@ define void @int16x16(ptr addrspace(1) %ptr) { %ptrf = getelementptr i16, ptr addrspace(1) %ptr, i64 15 %l0 = load i16, ptr addrspace(1) %ptr0, align 32 - %l1 = load i16, ptr addrspace(1) %ptr1, align 1 - %l2 = load i16, ptr addrspace(1) %ptr2, align 1 - %l3 = load i16, ptr addrspace(1) %ptr3, align 1 - %l4 = load i16, ptr addrspace(1) %ptr4, align 1 - %l5 = load i16, ptr addrspace(1) %ptr5, align 1 - %l6 = load i16, ptr addrspace(1) %ptr6, align 1 - %l7 = load i16, ptr addrspace(1) %ptr7, align 1 + %l1 = load i16, ptr addrspace(1) %ptr1 + %l2 = load i16, ptr addrspace(1) %ptr2 + %l3 = load i16, ptr addrspace(1) %ptr3 + %l4 = load i16, ptr addrspace(1) %ptr4 + %l5 = load i16, ptr addrspace(1) %ptr5 + %l6 = load i16, ptr addrspace(1) %ptr6 + %l7 = load i16, ptr addrspace(1) %ptr7 %l8 = load i16, ptr addrspace(1) %ptr8, align 16 - %l9 = load i16, ptr addrspace(1) %ptr9, align 1 - %la = load i16, ptr addrspace(1) %ptra, align 1 - %lb = load i16, ptr addrspace(1) %ptrb, align 1 - %lc = load i16, ptr addrspace(1) %ptrc, align 1 - %ld = load i16, ptr addrspace(1) %ptrd, align 1 - %le = load i16, ptr addrspace(1) %ptre, align 1 - %lf = load i16, ptr addrspace(1) %ptrf, align 1 - - store i16 %lf, ptr addrspace(1) %ptrc, align 1 - store i16 %le, ptr addrspace(1) %ptrd, align 1 - store i16 %ld, ptr addrspace(1) %ptre, align 1 - store i16 %lc, ptr addrspace(1) %ptrf, align 1 + %l9 = load i16, ptr addrspace(1) %ptr9 + %la = load i16, ptr addrspace(1) %ptra + %lb = load i16, ptr addrspace(1) %ptrb + %lc = load i16, ptr addrspace(1) %ptrc + %ld = load i16, ptr addrspace(1) 
%ptrd + %le = load i16, ptr addrspace(1) %ptre + %lf = load i16, ptr addrspace(1) %ptrf + + store i16 %lf, ptr addrspace(1) %ptrc + store i16 %le, ptr addrspace(1) %ptrd + store i16 %ld, ptr addrspace(1) %ptre + store i16 %lc, ptr addrspace(1) %ptrf store i16 %lb, ptr addrspace(1) %ptr0, align 32 - store i16 %la, ptr addrspace(1) %ptr1, align 1 - store i16 %l9, ptr addrspace(1) %ptr2, align 1 - store i16 %l8, ptr addrspace(1) %ptr3, align 1 - store i16 %l7, ptr addrspace(1) %ptr4, align 1 - store i16 %l6, ptr addrspace(1) %ptr5, align 1 - store i16 %l5, ptr addrspace(1) %ptr6, align 1 - store i16 %l4, ptr addrspace(1) %ptr7, align 1 + store i16 %la, ptr addrspace(1) %ptr1 + store i16 %l9, ptr addrspace(1) %ptr2 + store i16 %l8, ptr addrspace(1) %ptr3 + store i16 %l7, ptr addrspace(1) %ptr4 + store i16 %l6, ptr addrspace(1) %ptr5 + store i16 %l5, ptr addrspace(1) %ptr6 + store i16 %l4, ptr addrspace(1) %ptr7 store i16 %l3, ptr addrspace(1) %ptr8, align 16 - store i16 %l2, ptr addrspace(1) %ptr9, align 1 - store i16 %l1, ptr addrspace(1) %ptra, align 1 - store i16 %l0, ptr addrspace(1) %ptrb, align 1 + store i16 %l2, ptr addrspace(1) %ptr9 + store i16 %l1, ptr addrspace(1) %ptra + store i16 %l0, ptr addrspace(1) %ptrb ret void } @@ -463,22 +463,22 @@ define void @int32x8(ptr addrspace(1) %ptr) { %ptr7 = getelementptr i32, ptr addrspace(1) %ptr, i64 7 %l0 = load i32, ptr addrspace(1) %ptr0, align 32 - %l1 = load i32, ptr addrspace(1) %ptr1, align 1 - %l2 = load i32, ptr addrspace(1) %ptr2, align 1 - %l3 = load i32, ptr addrspace(1) %ptr3, align 1 + %l1 = load i32, ptr addrspace(1) %ptr1 + %l2 = load i32, ptr addrspace(1) %ptr2 + %l3 = load i32, ptr addrspace(1) %ptr3 %l4 = load i32, ptr addrspace(1) %ptr4, align 16 - %l5 = load i32, ptr addrspace(1) %ptr5, align 1 - %l6 = load i32, ptr addrspace(1) %ptr6, align 1 - %l7 = load i32, ptr addrspace(1) %ptr7, align 1 + %l5 = load i32, ptr addrspace(1) %ptr5 + %l6 = load i32, ptr addrspace(1) %ptr6 + %l7 = load i32, ptr addrspace(1) %ptr7 store i32 %l7, ptr addrspace(1) %ptr0, align 32 - store i32 %l6, ptr addrspace(1) %ptr1, align 1 - store i32 %l5, ptr addrspace(1) %ptr2, align 1 - store i32 %l4, ptr addrspace(1) %ptr3, align 1 + store i32 %l6, ptr addrspace(1) %ptr1 + store i32 %l5, ptr addrspace(1) %ptr2 + store i32 %l4, ptr addrspace(1) %ptr3 store i32 %l3, ptr addrspace(1) %ptr4, align 16 - store i32 %l2, ptr addrspace(1) %ptr5, align 1 - store i32 %l1, ptr addrspace(1) %ptr6, align 1 - store i32 %l0, ptr addrspace(1) %ptr7, align 1 + store i32 %l2, ptr addrspace(1) %ptr5 + store i32 %l1, ptr addrspace(1) %ptr6 + store i32 %l0, ptr addrspace(1) %ptr7 ret void } @@ -524,14 +524,14 @@ define void @int64x4(ptr addrspace(1) %ptr) { %ptr3 = getelementptr i64, ptr addrspace(1) %ptr, i64 3 %l0 = load i64, ptr addrspace(1) %ptr0, align 32 - %l1 = load i64, ptr addrspace(1) %ptr1, align 1 + %l1 = load i64, ptr addrspace(1) %ptr1 %l2 = load i64, ptr addrspace(1) %ptr2, align 16 - %l3 = load i64, ptr addrspace(1) %ptr3, align 1 + %l3 = load i64, ptr addrspace(1) %ptr3 store i64 %l3, ptr addrspace(1) %ptr0, align 32 - store i64 %l2, ptr addrspace(1) %ptr1, align 1 + store i64 %l2, ptr addrspace(1) %ptr1 store i64 %l1, ptr addrspace(1) %ptr2, align 16 - store i64 %l0, ptr addrspace(1) %ptr3, align 1 + store i64 %l0, ptr addrspace(1) %ptr3 ret void } @@ -597,22 +597,22 @@ define void @float32x8(ptr addrspace(1) %ptr) { %ptr7 = getelementptr float, ptr addrspace(1) %ptr, i64 7 %l0 = load float, ptr addrspace(1) %ptr0, align 32 - %l1 = load float, ptr 
addrspace(1) %ptr1, align 1 - %l2 = load float, ptr addrspace(1) %ptr2, align 1 - %l3 = load float, ptr addrspace(1) %ptr3, align 1 + %l1 = load float, ptr addrspace(1) %ptr1 + %l2 = load float, ptr addrspace(1) %ptr2 + %l3 = load float, ptr addrspace(1) %ptr3 %l4 = load float, ptr addrspace(1) %ptr4, align 16 - %l5 = load float, ptr addrspace(1) %ptr5, align 1 - %l6 = load float, ptr addrspace(1) %ptr6, align 1 - %l7 = load float, ptr addrspace(1) %ptr7, align 1 + %l5 = load float, ptr addrspace(1) %ptr5 + %l6 = load float, ptr addrspace(1) %ptr6 + %l7 = load float, ptr addrspace(1) %ptr7 store float %l7, ptr addrspace(1) %ptr0, align 32 - store float %l6, ptr addrspace(1) %ptr1, align 1 - store float %l5, ptr addrspace(1) %ptr2, align 1 - store float %l4, ptr addrspace(1) %ptr3, align 1 + store float %l6, ptr addrspace(1) %ptr1 + store float %l5, ptr addrspace(1) %ptr2 + store float %l4, ptr addrspace(1) %ptr3 store float %l3, ptr addrspace(1) %ptr4, align 16 - store float %l2, ptr addrspace(1) %ptr5, align 1 - store float %l1, ptr addrspace(1) %ptr6, align 1 - store float %l0, ptr addrspace(1) %ptr7, align 1 + store float %l2, ptr addrspace(1) %ptr5 + store float %l1, ptr addrspace(1) %ptr6 + store float %l0, ptr addrspace(1) %ptr7 ret void } @@ -658,14 +658,14 @@ define void @float64x4(ptr addrspace(1) %ptr) { %ptr3 = getelementptr double, ptr addrspace(1) %ptr, i64 3 %l0 = load double, ptr addrspace(1) %ptr0, align 32 - %l1 = load double, ptr addrspace(1) %ptr1, align 1 + %l1 = load double, ptr addrspace(1) %ptr1 %l2 = load double, ptr addrspace(1) %ptr2, align 16 - %l3 = load double, ptr addrspace(1) %ptr3, align 1 + %l3 = load double, ptr addrspace(1) %ptr3 store double %l3, ptr addrspace(1) %ptr0, align 32 - store double %l2, ptr addrspace(1) %ptr1, align 1 + store double %l2, ptr addrspace(1) %ptr1 store double %l1, ptr addrspace(1) %ptr2, align 16 - store double %l0, ptr addrspace(1) %ptr3, align 1 + store double %l0, ptr addrspace(1) %ptr3 ret void } @@ -707,22 +707,22 @@ define void @int32x8_non_global(ptr %ptr) { %ptr7 = getelementptr i32, ptr %ptr, i64 7 %l0 = load i32, ptr %ptr0, align 32 - %l1 = load i32, ptr %ptr1, align 1 - %l2 = load i32, ptr %ptr2, align 1 - %l3 = load i32, ptr %ptr3, align 1 + %l1 = load i32, ptr %ptr1 + %l2 = load i32, ptr %ptr2 + %l3 = load i32, ptr %ptr3 %l4 = load i32, ptr %ptr4, align 16 - %l5 = load i32, ptr %ptr5, align 1 - %l6 = load i32, ptr %ptr6, align 1 - %l7 = load i32, ptr %ptr7, align 1 + %l5 = load i32, ptr %ptr5 + %l6 = load i32, ptr %ptr6 + %l7 = load i32, ptr %ptr7 store i32 %l7, ptr %ptr0, align 32 - store i32 %l6, ptr %ptr1, align 1 - store i32 %l5, ptr %ptr2, align 1 - store i32 %l4, ptr %ptr3, align 1 + store i32 %l6, ptr %ptr1 + store i32 %l5, ptr %ptr2 + store i32 %l4, ptr %ptr3 store i32 %l3, ptr %ptr4, align 16 - store i32 %l2, ptr %ptr5, align 1 - store i32 %l1, ptr %ptr6, align 1 - store i32 %l0, ptr %ptr7, align 1 + store i32 %l2, ptr %ptr5 + store i32 %l1, ptr %ptr6 + store i32 %l0, ptr %ptr7 ret void } From 25636fe843ea63d237fccb539aa58aa3ffca95d9 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Fri, 9 May 2025 22:38:03 +0000 Subject: [PATCH 04/11] Clang format --- llvm/lib/Target/NVPTX/NVPTX.h | 7 +------ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 8 ++++---- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h index e7901d87f9b86..6c0d0e39ad160 100644 --- a/llvm/lib/Target/NVPTX/NVPTX.h +++ b/llvm/lib/Target/NVPTX/NVPTX.h @@ -190,12 +190,7 @@ 
enum AddressSpace : AddressSpaceUnderlyingType { }; namespace PTXLdStInstCode { -enum FromType { - Unsigned = 0, - Signed, - Float, - Untyped -}; +enum FromType { Unsigned = 0, Signed, Float, Untyped }; } // namespace PTXLdStInstCode /// PTXCvtMode - Conversion code enumeration diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 0280b1a442e09..5613f14f8c25f 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1322,10 +1322,10 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { std::nullopt, NVPTX::INT_PTX_LDU_G_v4f32_ELE, std::nullopt); break; case NVPTXISD::LoadV8: - Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, std::nullopt, std::nullopt, - NVPTX::INT_PTX_LDG_G_v8i32_ELE, std::nullopt, NVPTX::INT_PTX_LDG_G_v8f32_ELE, - std::nullopt); + Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, std::nullopt, + std::nullopt, NVPTX::INT_PTX_LDG_G_v8i32_ELE, + std::nullopt, NVPTX::INT_PTX_LDG_G_v8f32_ELE, + std::nullopt); break; } if (!Opcode) From 7a737f9cb3208021e9442a0810075493de67f9dd Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Mon, 12 May 2025 15:56:47 +0000 Subject: [PATCH 05/11] Replace std::nullopt with empty initializer list: --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 34 ++++++++++----------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 5613f14f8c25f..c5ddfbfc8ec20 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1214,9 +1214,9 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { NVPTX::LDV_f32_v4, NVPTX::LDV_f64_v4); break; case NVPTXISD::LoadV8: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, std::nullopt, - std::nullopt, NVPTX::LDV_i32_v8, std::nullopt, - NVPTX::LDV_f32_v8, std::nullopt); + Opcode = pickOpcodeForVT( + EltVT.getSimpleVT().SimpleTy, {/*no i8_v8*/}, {/*no i16_v8*/}, + NVPTX::LDV_i32_v8, {/*no i64_v8*/}, NVPTX::LDV_f32_v8, {/*no f64_v8*/}); break; } if (!Opcode) @@ -1319,13 +1319,13 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE, NVPTX::INT_PTX_LDU_G_v4i16_ELE, NVPTX::INT_PTX_LDU_G_v4i32_ELE, - std::nullopt, NVPTX::INT_PTX_LDU_G_v4f32_ELE, std::nullopt); + {/*no v4i64*/}, NVPTX::INT_PTX_LDU_G_v4f32_ELE, {/*no v4f64*/}); break; case NVPTXISD::LoadV8: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, std::nullopt, - std::nullopt, NVPTX::INT_PTX_LDG_G_v8i32_ELE, - std::nullopt, NVPTX::INT_PTX_LDG_G_v8f32_ELE, - std::nullopt); + Opcode = pickOpcodeForVT( + EltVT.getSimpleVT().SimpleTy, {/*no i8_v8*/}, {/*no i16_v8*/}, + NVPTX::INT_PTX_LDG_G_v8i32_ELE, {/*no i64_v8*/}, + NVPTX::INT_PTX_LDG_G_v8f32_ELE, {/*no f64_v8*/}); break; } if (!Opcode) @@ -1497,9 +1497,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { NVPTX::STV_f32_v4, NVPTX::STV_f64_v4); break; case NVPTXISD::StoreV8: - Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, std::nullopt, - std::nullopt, NVPTX::STV_i32_v8, std::nullopt, - NVPTX::STV_f32_v8, std::nullopt); + Opcode = pickOpcodeForVT( + EltVT.getSimpleVT().SimpleTy, {/*no i8_v8*/}, {/*no i16_v8*/}, + NVPTX::STV_i32_v8, {/*no i64_v8*/}, NVPTX::STV_f32_v8, {/*no f64_v8*/}); break; } @@ -1559,10 +1559,10 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { NVPTX::LoadParamMemV2F64); break; case 4: - Opcode = - 
pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8, - NVPTX::LoadParamMemV4I16, NVPTX::LoadParamMemV4I32, - std::nullopt, NVPTX::LoadParamMemV4F32, std::nullopt); + Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, + NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16, + NVPTX::LoadParamMemV4I32, {/*no V4I64*/}, + NVPTX::LoadParamMemV4F32, {/*no V4F64*/}); break; } if (!Opcode) @@ -1653,8 +1653,8 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { case 4: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16, - NVPTX::StoreRetvalV4I32, std::nullopt, - NVPTX::StoreRetvalV4F32, std::nullopt); + NVPTX::StoreRetvalV4I32, {/*no V4I64*/}, + NVPTX::StoreRetvalV4F32, {/*no V4F64*/}); break; } if (!Opcode) From 9ca5da5a3a1147373c04acbe1f4da3e66b6ca67e Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Mon, 12 May 2025 16:00:15 +0000 Subject: [PATCH 06/11] Make comments consistent --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 32 +++++++++++---------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index c5ddfbfc8ec20..9e17fa9f97e51 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1214,9 +1214,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) { NVPTX::LDV_f32_v4, NVPTX::LDV_f64_v4); break; case NVPTXISD::LoadV8: - Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, {/*no i8_v8*/}, {/*no i16_v8*/}, - NVPTX::LDV_i32_v8, {/*no i64_v8*/}, NVPTX::LDV_f32_v8, {/*no f64_v8*/}); + Opcode = + pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */}, + {/* no v8i16 */}, NVPTX::LDV_i32_v8, {/* no v8i64 */}, + NVPTX::LDV_f32_v8, {/* no v8f64 */}); break; } if (!Opcode) @@ -1319,13 +1320,13 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) { Opcode = pickOpcodeForVT( EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE, NVPTX::INT_PTX_LDU_G_v4i16_ELE, NVPTX::INT_PTX_LDU_G_v4i32_ELE, - {/*no v4i64*/}, NVPTX::INT_PTX_LDU_G_v4f32_ELE, {/*no v4f64*/}); + {/* no v4i64 */}, NVPTX::INT_PTX_LDU_G_v4f32_ELE, {/* no v4f64 */}); break; case NVPTXISD::LoadV8: - Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, {/*no i8_v8*/}, {/*no i16_v8*/}, - NVPTX::INT_PTX_LDG_G_v8i32_ELE, {/*no i64_v8*/}, - NVPTX::INT_PTX_LDG_G_v8f32_ELE, {/*no f64_v8*/}); + Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */}, + {/* no v8i16 */}, NVPTX::INT_PTX_LDG_G_v8i32_ELE, + {/* no v8i64 */}, NVPTX::INT_PTX_LDG_G_v8f32_ELE, + {/* no v8f64 */}); break; } if (!Opcode) @@ -1497,9 +1498,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { NVPTX::STV_f32_v4, NVPTX::STV_f64_v4); break; case NVPTXISD::StoreV8: - Opcode = pickOpcodeForVT( - EltVT.getSimpleVT().SimpleTy, {/*no i8_v8*/}, {/*no i16_v8*/}, - NVPTX::STV_i32_v8, {/*no i64_v8*/}, NVPTX::STV_f32_v8, {/*no f64_v8*/}); + Opcode = + pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy, {/* no v8i8 */}, + {/* no v8i16 */}, NVPTX::STV_i32_v8, {/* no v8i64 */}, + NVPTX::STV_f32_v8, {/* no v8f64 */}); break; } @@ -1561,8 +1563,8 @@ bool NVPTXDAGToDAGISel::tryLoadParam(SDNode *Node) { case 4: Opcode = pickOpcodeForVT(MemVT.getSimpleVT().SimpleTy, NVPTX::LoadParamMemV4I8, NVPTX::LoadParamMemV4I16, - NVPTX::LoadParamMemV4I32, {/*no V4I64*/}, - NVPTX::LoadParamMemV4F32, {/*no V4F64*/}); + NVPTX::LoadParamMemV4I32, {/* no v4i64 */}, + NVPTX::LoadParamMemV4F32, {/* no v4f64 */}); break; } if (!Opcode) @@ -1653,8 
+1655,8 @@ bool NVPTXDAGToDAGISel::tryStoreRetval(SDNode *N) { case 4: Opcode = pickOpcodeForVT(Mem->getMemoryVT().getSimpleVT().SimpleTy, NVPTX::StoreRetvalV4I8, NVPTX::StoreRetvalV4I16, - NVPTX::StoreRetvalV4I32, {/*no V4I64*/}, - NVPTX::StoreRetvalV4F32, {/*no V4F64*/}); + NVPTX::StoreRetvalV4I32, {/* no v4i64 */}, + NVPTX::StoreRetvalV4F32, {/* no v4f64 */}); break; } if (!Opcode) From 502c43796c3cd7cdea659091cc43fc89b23a88e7 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Mon, 12 May 2025 17:37:57 +0000 Subject: [PATCH 07/11] Simplify loop --- llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 9e17fa9f97e51..71d1da781ab8e 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1461,8 +1461,9 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) { unsigned NumElts = getLoadStoreVectorNumElts(N); SmallVector Ops; - for (unsigned I : llvm::seq(NumElts)) - Ops.append({N->getOperand(I + 1)}); + // Append the operands from 1 to NumElts, inclusive + const SDUse *FirstStoredVal = N->ops().begin() + 1; + Ops.append(FirstStoredVal, FirstStoredVal + NumElts); SDValue N2 = N->getOperand(NumElts + 1); unsigned ToTypeWidth = TotalWidth / NumElts; From 327459e377620159226ede48e3eda000579ad9ba Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Mon, 12 May 2025 18:29:40 +0000 Subject: [PATCH 08/11] Fix issues with removal of VecType, all tests pass now --- llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp | 2 +- llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp | 4 ++-- .../MIR/NVPTX/floating-point-immediate-operands.mir | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp index f5063a80b8a15..008209785a683 100644 --- a/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXForwardParams.cpp @@ -105,7 +105,7 @@ static bool eliminateMove(MachineInstr &Mov, const MachineRegisterInfo &MRI, const MachineOperand *ParamSymbol = Mov.uses().begin(); assert(ParamSymbol->isSymbol()); - constexpr unsigned LDInstBasePtrOpIdx = 6; + constexpr unsigned LDInstBasePtrOpIdx = 5; constexpr unsigned LDInstAddrSpaceOpIdx = 2; for (auto *LI : LoadInsts) { (LI->uses().begin() + LDInstBasePtrOpIdx) diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp index 46e4a905aa09a..9b5fe473521a1 100644 --- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp @@ -1808,8 +1808,8 @@ bool NVPTXReplaceImageHandles::replaceImageHandle(MachineOperand &Op, // For CUDA, we preserve the param loads coming from function arguments return false; - assert(TexHandleDef.getOperand(7).isSymbol() && "Load is not a symbol!"); - StringRef Sym = TexHandleDef.getOperand(7).getSymbolName(); + assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!"); + StringRef Sym = TexHandleDef.getOperand(6).getSymbolName(); InstrsToRemove.insert(&TexHandleDef); Op.ChangeToES(Sym.data()); MFI->getImageHandleSymbolIndex(Sym); diff --git a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir index 486c6ca16a531..799a30b094542 100644 --- a/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir +++ 
b/llvm/test/CodeGen/MIR/NVPTX/floating-point-immediate-operands.mir @@ -40,9 +40,9 @@ registers: - { id: 7, class: float32regs } body: | bb.0.entry: - %0 = LD_f32 0, 0, 4, 1, 2, 32, &test_param_0, 0 + %0 = LD_f32 0, 0, 4, 2, 32, &test_param_0, 0 %1 = CVT_f64_f32 %0, 0 - %2 = LD_i32 0, 0, 4, 1, 0, 32, &test_param_1, 0 + %2 = LD_i32 0, 0, 4, 0, 32, &test_param_1, 0 ; CHECK: %3:float64regs = FADD_rnf64ri %1, double 3.250000e+00 %3 = FADD_rnf64ri %1, double 3.250000e+00 %4 = CVT_f32_f64 %3, 5 @@ -66,9 +66,9 @@ registers: - { id: 7, class: float32regs } body: | bb.0.entry: - %0 = LD_f32 0, 0, 4, 1, 2, 32, &test2_param_0, 0 + %0 = LD_f32 0, 0, 4, 2, 32, &test2_param_0, 0 %1 = CVT_f64_f32 %0, 0 - %2 = LD_i32 0, 0, 4, 1, 0, 32, &test2_param_1, 0 + %2 = LD_i32 0, 0, 4, 0, 32, &test2_param_1, 0 ; CHECK: %3:float64regs = FADD_rnf64ri %1, double 0x7FF8000000000000 %3 = FADD_rnf64ri %1, double 0x7FF8000000000000 %4 = CVT_f32_f64 %3, 5 From 7bc66d08ee5289cd38598381bd6b6b8487fc0cba Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Mon, 12 May 2025 19:19:10 +0000 Subject: [PATCH 09/11] Update test checks to ld/st untyped data --- llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll | 72 +-- .../load-store-256-addressing-invariant.ll | 96 +-- .../NVPTX/load-store-256-addressing.ll | 128 ++-- .../CodeGen/NVPTX/load-store-vectors-256.ll | 608 +++++++++--------- 4 files changed, 452 insertions(+), 452 deletions(-) diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll index 8381b3b7dbea4..fbb5c9ab5cf49 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -15,8 +15,8 @@ define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v32i8_param_0]; -; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; ; SM90-NEXT: bfe.u32 %r5, %r4, 0, 8; ; SM90-NEXT: cvt.u16.u32 %rs1, %r5; ; SM90-NEXT: bfe.u32 %r6, %r3, 0, 8; @@ -25,7 +25,7 @@ define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) { ; SM90-NEXT: cvt.u16.u32 %rs3, %r7; ; SM90-NEXT: bfe.u32 %r8, %r1, 0, 8; ; SM90-NEXT: cvt.u16.u32 %rs4, %r8; -; SM90-NEXT: ld.global.nc.v4.u32 {%r9, %r10, %r11, %r12}, [%rd1]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r9, %r10, %r11, %r12}, [%rd1]; ; SM90-NEXT: bfe.u32 %r13, %r12, 0, 8; ; SM90-NEXT: cvt.u16.u32 %rs5, %r13; ; SM90-NEXT: bfe.u32 %r14, %r11, 0, 8; @@ -53,7 +53,7 @@ define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) { ; SM100-NEXT: .reg .b64 %rd<2>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v32i8_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0]; ; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: bfe.u32 %r9, %r8, 0, 8; ; SM100-NEXT: cvt.u16.u32 %rs1, %r9; @@ -109,13 +109,13 @@ define i16 @ld_global_v16i16(ptr addrspace(1) %ptr) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v16i16_param_0]; -; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; ; SM90-NEXT: mov.b32 {%rs1, _}, %r4; ; SM90-NEXT: mov.b32 {%rs2, _}, %r3; ; SM90-NEXT: mov.b32 {%rs3, _}, %r2; ; SM90-NEXT: mov.b32 {%rs4, _}, 
%r1; -; SM90-NEXT: ld.global.nc.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; ; SM90-NEXT: mov.b32 {%rs5, _}, %r8; ; SM90-NEXT: mov.b32 {%rs6, _}, %r7; ; SM90-NEXT: mov.b32 {%rs7, _}, %r6; @@ -138,7 +138,7 @@ define i16 @ld_global_v16i16(ptr addrspace(1) %ptr) { ; SM100-NEXT: .reg .b64 %rd<2>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v16i16_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0]; ; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: mov.b32 {%rs1, _}, %r8; ; SM100-NEXT: mov.b32 {%rs2, _}, %r7; @@ -185,13 +185,13 @@ define half @ld_global_v16f16(ptr addrspace(1) %ptr) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v16f16_param_0]; -; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; ; SM90-NEXT: mov.b32 {%rs1, _}, %r4; ; SM90-NEXT: mov.b32 {%rs2, _}, %r3; ; SM90-NEXT: mov.b32 {%rs3, _}, %r2; ; SM90-NEXT: mov.b32 {%rs4, _}, %r1; -; SM90-NEXT: ld.global.nc.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; ; SM90-NEXT: mov.b32 {%rs5, _}, %r8; ; SM90-NEXT: mov.b32 {%rs6, _}, %r7; ; SM90-NEXT: mov.b32 {%rs7, _}, %r6; @@ -213,7 +213,7 @@ define half @ld_global_v16f16(ptr addrspace(1) %ptr) { ; SM100-NEXT: .reg .b64 %rd<2>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v16f16_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0]; ; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: mov.b32 {%rs1, _}, %r8; ; SM100-NEXT: mov.b32 {%rs2, _}, %r7; @@ -259,13 +259,13 @@ define bfloat @ld_global_v16bf16(ptr addrspace(1) %ptr) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v16bf16_param_0]; -; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1+16]; ; SM90-NEXT: mov.b32 {%rs1, _}, %r4; ; SM90-NEXT: mov.b32 {%rs2, _}, %r3; ; SM90-NEXT: mov.b32 {%rs3, _}, %r2; ; SM90-NEXT: mov.b32 {%rs4, _}, %r1; -; SM90-NEXT: ld.global.nc.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; ; SM90-NEXT: mov.b32 {%rs5, _}, %r8; ; SM90-NEXT: mov.b32 {%rs6, _}, %r7; ; SM90-NEXT: mov.b32 {%rs7, _}, %r6; @@ -287,7 +287,7 @@ define bfloat @ld_global_v16bf16(ptr addrspace(1) %ptr) { ; SM100-NEXT: .reg .b64 %rd<2>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v16bf16_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0]; ; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: mov.b32 {%rs1, _}, %r8; ; SM100-NEXT: mov.b32 {%rs2, _}, %r7; @@ -332,9 +332,9 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v8i32_param_0]; -; SM90-NEXT: ld.global.nc.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1+16]; -; SM90-NEXT: ld.global.nc.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r1, 
%r2, %r3, %r4}, [%rd1+16]; +; SM90-NEXT: ld.global.nc.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1]; ; SM90-NEXT: add.s32 %r9, %r5, %r6; ; SM90-NEXT: add.s32 %r10, %r7, %r8; ; SM90-NEXT: add.s32 %r11, %r1, %r2; @@ -351,7 +351,7 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) { ; SM100-NEXT: .reg .b64 %rd<2>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v8i32_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0]; ; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: add.s32 %r9, %r1, %r2; ; SM100-NEXT: add.s32 %r10, %r3, %r4; @@ -389,9 +389,9 @@ define float @ld_global_v8f32(ptr addrspace(1) %ptr) { ; SM90-NEXT: .reg .b64 %rd<2>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v8f32_param_0]; -; SM90-NEXT: ld.global.nc.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1+16]; -; SM90-NEXT: ld.global.nc.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0]; +; SM90-NEXT: ld.global.nc.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1+16]; +; SM90-NEXT: ld.global.nc.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1]; ; SM90-NEXT: add.rn.f32 %f9, %f5, %f6; ; SM90-NEXT: add.rn.f32 %f10, %f7, %f8; ; SM90-NEXT: add.rn.f32 %f11, %f1, %f2; @@ -399,7 +399,7 @@ define float @ld_global_v8f32(ptr addrspace(1) %ptr) { ; SM90-NEXT: add.rn.f32 %f13, %f9, %f10; ; SM90-NEXT: add.rn.f32 %f14, %f11, %f12; ; SM90-NEXT: add.rn.f32 %f15, %f13, %f14; -; SM90-NEXT: st.param.f32 [func_retval0], %f15; +; SM90-NEXT: st.param.b32 [func_retval0], %f15; ; SM90-NEXT: ret; ; ; SM100-LABEL: ld_global_v8f32( @@ -408,7 +408,7 @@ define float @ld_global_v8f32(ptr addrspace(1) %ptr) { ; SM100-NEXT: .reg .b64 %rd<2>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v8f32_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0]; ; SM100-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; ; SM100-NEXT: add.rn.f32 %f9, %f1, %f2; ; SM100-NEXT: add.rn.f32 %f10, %f3, %f4; @@ -417,7 +417,7 @@ define float @ld_global_v8f32(ptr addrspace(1) %ptr) { ; SM100-NEXT: add.rn.f32 %f13, %f9, %f10; ; SM100-NEXT: add.rn.f32 %f14, %f11, %f12; ; SM100-NEXT: add.rn.f32 %f15, %f13, %f14; -; SM100-NEXT: st.param.f32 [func_retval0], %f15; +; SM100-NEXT: st.param.b32 [func_retval0], %f15; ; SM100-NEXT: ret; %a = load <8 x float>, ptr addrspace(1) %ptr, !invariant.load !0 %v1 = extractelement <8 x float> %a, i32 0 @@ -445,9 +445,9 @@ define i64 @ld_global_v4i64(ptr addrspace(1) %ptr) { ; SM90-NEXT: .reg .b64 %rd<9>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v4i64_param_0]; -; SM90-NEXT: ld.global.nc.v2.u64 {%rd2, %rd3}, [%rd1+16]; -; SM90-NEXT: ld.global.nc.v2.u64 {%rd4, %rd5}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0]; +; SM90-NEXT: ld.global.nc.v2.b64 {%rd2, %rd3}, [%rd1+16]; +; SM90-NEXT: ld.global.nc.v2.b64 {%rd4, %rd5}, [%rd1]; ; SM90-NEXT: add.s64 %rd6, %rd4, %rd5; ; SM90-NEXT: add.s64 %rd7, %rd2, %rd3; ; SM90-NEXT: add.s64 %rd8, %rd6, %rd7; @@ -459,7 +459,7 @@ define i64 @ld_global_v4i64(ptr addrspace(1) %ptr) { ; SM100-NEXT: .reg .b64 %rd<9>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v4i64_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0]; ; SM100-NEXT: ld.global.nc.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; ; SM100-NEXT: add.s64 %rd6, %rd2, %rd3; ; SM100-NEXT: add.s64 %rd7, %rd4, %rd5; @@ -484,13 +484,13 @@ define double 
@ld_global_v4f64(ptr addrspace(1) %ptr) { ; SM90-NEXT: .reg .b64 %fd<8>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [ld_global_v4f64_param_0]; -; SM90-NEXT: ld.global.nc.v2.f64 {%fd1, %fd2}, [%rd1+16]; -; SM90-NEXT: ld.global.nc.v2.f64 {%fd3, %fd4}, [%rd1]; +; SM90-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0]; +; SM90-NEXT: ld.global.nc.v2.b64 {%fd1, %fd2}, [%rd1+16]; +; SM90-NEXT: ld.global.nc.v2.b64 {%fd3, %fd4}, [%rd1]; ; SM90-NEXT: add.rn.f64 %fd5, %fd3, %fd4; ; SM90-NEXT: add.rn.f64 %fd6, %fd1, %fd2; ; SM90-NEXT: add.rn.f64 %fd7, %fd5, %fd6; -; SM90-NEXT: st.param.f64 [func_retval0], %fd7; +; SM90-NEXT: st.param.b64 [func_retval0], %fd7; ; SM90-NEXT: ret; ; ; SM100-LABEL: ld_global_v4f64( @@ -499,12 +499,12 @@ define double @ld_global_v4f64(ptr addrspace(1) %ptr) { ; SM100-NEXT: .reg .b64 %fd<8>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [ld_global_v4f64_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0]; ; SM100-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; ; SM100-NEXT: add.rn.f64 %fd5, %fd1, %fd2; ; SM100-NEXT: add.rn.f64 %fd6, %fd3, %fd4; ; SM100-NEXT: add.rn.f64 %fd7, %fd5, %fd6; -; SM100-NEXT: st.param.f64 [func_retval0], %fd7; +; SM100-NEXT: st.param.b64 [func_retval0], %fd7; ; SM100-NEXT: ret; %a = load <4 x double>, ptr addrspace(1) %ptr, !invariant.load !0 %v1 = extractelement <4 x double> %a, i32 0 diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index 0d051848c0d06..eed70d8b42c48 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -86,7 +86,7 @@ define void @avar_i32() { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.u32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) @globalin, !invariant.load !0 store <8 x i32> %load, ptr addrspace(1) @globalout @@ -100,7 +100,7 @@ define void @avar_i64() { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.global.nc.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; -; PTX-NEXT: st.global.v4.u64 [globalout], {%rd1, %rd2, %rd3, %rd4}; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <4 x i64>, ptr addrspace(1) @globalin, !invariant.load !0 store <4 x i64> %load, ptr addrspace(1) @globalout @@ -114,7 +114,7 @@ define void @avar_float() { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin]; -; PTX-NEXT: st.global.v8.f32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: st.global.v8.b32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) @globalin, !invariant.load !0 store <8 x float> %load, ptr addrspace(1) @globalout @@ -128,7 +128,7 @@ define void @avar_double() { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin]; -; PTX-NEXT: st.global.v4.f64 [globalout], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: st.global.v4.b64 [globalout], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %load = load <4 x double>, ptr addrspace(1) @globalin, !invariant.load !0 store <4 x double> %load, ptr 
addrspace(1) @globalout @@ -206,7 +206,7 @@ define void @asi_i32() { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.u32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -222,7 +222,7 @@ define void @asi_i64() { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.global.nc.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; -; PTX-NEXT: st.global.v4.u64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -238,7 +238,7 @@ define void @asi_float() { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32]; -; PTX-NEXT: st.global.v8.f32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -254,7 +254,7 @@ define void @asi_double() { ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: ; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32]; -; PTX-NEXT: st.global.v4.f64 [globalout+32], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -270,9 +270,9 @@ define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i8_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i8_param_0]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i8_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i8_param_1]; ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <32 x i8>, ptr addrspace(1) %in, !invariant.load !0 @@ -286,9 +286,9 @@ define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i16_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i16_param_0]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i16_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i16_param_1]; ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x i16>, ptr addrspace(1) %in, !invariant.load !0 @@ -302,9 +302,9 @@ define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_half_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_half_param_0]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, 
[%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_half_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_half_param_1]; ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x half>, ptr addrspace(1) %in, !invariant.load !0 @@ -318,9 +318,9 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_bfloat_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_bfloat_param_0]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_bfloat_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_bfloat_param_1]; ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x bfloat>, ptr addrspace(1) %in, !invariant.load !0 @@ -335,10 +335,10 @@ define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i32_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i32_param_1]; -; PTX-NEXT: st.global.v8.u32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) %in, !invariant.load !0 store <8 x i32> %load, ptr addrspace(1) %out @@ -351,10 +351,10 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i64_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i64_param_0]; ; PTX-NEXT: ld.global.nc.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd6, [areg_64_i64_param_1]; -; PTX-NEXT: st.global.v4.u64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i64_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load = load <4 x i64>, ptr addrspace(1) %in, !invariant.load !0 store <4 x i64> %load, ptr addrspace(1) %out @@ -368,10 +368,10 @@ define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_float_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0]; ; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_float_param_1]; -; PTX-NEXT: st.global.v8.f32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) %in, !invariant.load !0 store <8 x float> %load, ptr addrspace(1) %out @@ -385,10 +385,10 @@ define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %fd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_double_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_double_param_0]; ; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_double_param_1]; -; PTX-NEXT: 
st.global.v4.f64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_double_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %load = load <4 x double>, ptr addrspace(1) %in, !invariant.load !0 store <4 x double> %load, ptr addrspace(1) %out @@ -402,8 +402,8 @@ define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i8_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i8_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i8_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i8_param_1]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; @@ -421,8 +421,8 @@ define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i16_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i16_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i16_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i16_param_1]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; @@ -440,8 +440,8 @@ define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_half_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_half_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_half_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_half_param_1]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; @@ -459,8 +459,8 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_bfloat_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_bfloat_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_bfloat_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_bfloat_param_1]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; @@ -478,10 +478,10 @@ define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i32_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i32_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1]; ; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.u32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -496,10 +496,10 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i64_param_0]; -; PTX-NEXT: 
ld.param.u64 %rd2, [ari_64_i64_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i64_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i64_param_1]; ; PTX-NEXT: ld.global.nc.v4.u64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; -; PTX-NEXT: st.global.v4.u64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <4 x i64>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -515,10 +515,10 @@ define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_float_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_float_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1]; ; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.f32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset, !invariant.load !0 @@ -534,10 +534,10 @@ define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %fd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_double_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_double_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_double_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_double_param_1]; ; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32]; -; PTX-NEXT: st.global.v4.f64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <4 x double>, ptr addrspace(1) %in.offset, !invariant.load !0 diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll index 4543f75dfa1eb..0e61478520abb 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing.ll @@ -81,8 +81,8 @@ define void @avar_i32() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; -; PTX-NEXT: st.global.v8.u32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) @globalin store <8 x i32> %load, ptr addrspace(1) @globalout @@ -95,8 +95,8 @@ define void @avar_i64() { ; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; -; PTX-NEXT: st.global.v4.u64 [globalout], {%rd1, %rd2, %rd3, %rd4}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <4 x i64>, ptr addrspace(1) @globalin store <4 x i64> %load, ptr addrspace(1) @globalout @@ -109,8 +109,8 @@ define void @avar_float() { ; PTX-NEXT: .reg .b32 %f<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.f32 {%f1, 
%f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin]; -; PTX-NEXT: st.global.v8.f32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin]; +; PTX-NEXT: st.global.v8.b32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) @globalin store <8 x float> %load, ptr addrspace(1) @globalout @@ -123,8 +123,8 @@ define void @avar_double() { ; PTX-NEXT: .reg .b64 %fd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin]; -; PTX-NEXT: st.global.v4.f64 [globalout], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin]; +; PTX-NEXT: st.global.v4.b64 [globalout], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %load = load <4 x double>, ptr addrspace(1) @globalin store <4 x double> %load, ptr addrspace(1) @globalout @@ -201,8 +201,8 @@ define void @asi_i32() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; -; PTX-NEXT: st.global.v8.u32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset @@ -217,8 +217,8 @@ define void @asi_i64() { ; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; -; PTX-NEXT: st.global.v4.u64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; +; PTX-NEXT: ld.global.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <4 x i64>, ptr addrspace(1) %in.offset @@ -233,8 +233,8 @@ define void @asi_float() { ; PTX-NEXT: .reg .b32 %f<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32]; -; PTX-NEXT: st.global.v8.f32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32]; +; PTX-NEXT: st.global.v8.b32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset @@ -249,8 +249,8 @@ define void @asi_double() { ; PTX-NEXT: .reg .b64 %fd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32]; -; PTX-NEXT: st.global.v4.f64 [globalout+32], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32]; +; PTX-NEXT: st.global.v4.b64 [globalout+32], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 %load = load <4 x double>, ptr addrspace(1) %in.offset @@ -266,9 +266,9 @@ define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i8_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i8_param_0]; ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, 
%r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i8_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i8_param_1]; ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <32 x i8>, ptr addrspace(1) %in @@ -282,9 +282,9 @@ define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i16_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i16_param_0]; ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i16_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i16_param_1]; ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x i16>, ptr addrspace(1) %in @@ -298,9 +298,9 @@ define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_half_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_half_param_0]; ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_half_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_half_param_1]; ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x half>, ptr addrspace(1) %in @@ -314,9 +314,9 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_bfloat_param_0]; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_bfloat_param_0]; ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_bfloat_param_1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_bfloat_param_1]; ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x bfloat>, ptr addrspace(1) %in @@ -331,10 +331,10 @@ define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i32_param_0]; -; PTX-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_i32_param_1]; -; PTX-NEXT: st.global.v8.u32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr addrspace(1) %in store <8 x i32> %load, ptr addrspace(1) %out @@ -347,10 +347,10 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_i64_param_0]; -; PTX-NEXT: ld.global.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd6, [areg_64_i64_param_1]; -; PTX-NEXT: st.global.v4.u64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i64_param_0]; +; PTX-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i64_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; PTX-NEXT: ret; %load 
= load <4 x i64>, ptr addrspace(1) %in store <4 x i64> %load, ptr addrspace(1) %out @@ -364,10 +364,10 @@ define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_float_param_0]; -; PTX-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_float_param_1]; -; PTX-NEXT: st.global.v8.f32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0]; +; PTX-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1]; +; PTX-NEXT: st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) %in store <8 x float> %load, ptr addrspace(1) %out @@ -381,10 +381,10 @@ define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %fd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [areg_64_double_param_0]; -; PTX-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; -; PTX-NEXT: ld.param.u64 %rd2, [areg_64_double_param_1]; -; PTX-NEXT: st.global.v4.f64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ld.param.b64 %rd1, [areg_64_double_param_0]; +; PTX-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; PTX-NEXT: ld.param.b64 %rd2, [areg_64_double_param_1]; +; PTX-NEXT: st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %load = load <4 x double>, ptr addrspace(1) %in store <4 x double> %load, ptr addrspace(1) %out @@ -398,8 +398,8 @@ define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i8_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i8_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i8_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i8_param_1]; ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; @@ -417,8 +417,8 @@ define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i16_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i16_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i16_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i16_param_1]; ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; @@ -436,8 +436,8 @@ define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_half_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_half_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_half_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_half_param_1]; ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; @@ -455,8 +455,8 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_bfloat_param_0]; -; PTX-NEXT: 
ld.param.u64 %rd2, [ari_64_bfloat_param_1]; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_bfloat_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_bfloat_param_1]; ; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; @@ -474,10 +474,10 @@ define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i32_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i32_param_1]; -; PTX-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.u32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1]; +; PTX-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x i32>, ptr addrspace(1) %in.offset @@ -492,10 +492,10 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<7>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_i64_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_i64_param_1]; -; PTX-NEXT: ld.global.v4.u64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; -; PTX-NEXT: st.global.v4.u64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i64_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i64_param_1]; +; PTX-NEXT: ld.global.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <4 x i64>, ptr addrspace(1) %in.offset @@ -511,10 +511,10 @@ define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %rd<3>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_float_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_float_param_1]; -; PTX-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32]; -; PTX-NEXT: st.global.v8.f32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1]; +; PTX-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32]; +; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32 %load = load <8 x float>, ptr addrspace(1) %in.offset @@ -530,10 +530,10 @@ define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) { ; PTX-NEXT: .reg .b64 %fd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.param.u64 %rd1, [ari_64_double_param_0]; -; PTX-NEXT: ld.param.u64 %rd2, [ari_64_double_param_1]; -; PTX-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32]; -; PTX-NEXT: st.global.v4.f64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4}; +; PTX-NEXT: ld.param.b64 %rd1, [ari_64_double_param_0]; +; PTX-NEXT: ld.param.b64 %rd2, [ari_64_double_param_1]; +; PTX-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32]; +; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, 
i32 32 %load = load <4 x double>, ptr addrspace(1) %in.offset diff --git a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll index 2d19c308e4f3a..306e71eadca70 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-vectors-256.ll @@ -30,10 +30,10 @@ define void @generic_32xi8(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_32xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_32xi8_param_0]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_32xi8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_32xi8_param_1]; ; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -49,10 +49,10 @@ define void @generic_16xi16(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_16xi16_param_0]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_16xi16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_16xi16_param_1]; ; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -68,10 +68,10 @@ define void @generic_16xhalf(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xhalf_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_16xhalf_param_0]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_16xhalf_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_16xhalf_param_1]; ; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -87,10 +87,10 @@ define void @generic_16xbfloat(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_16xbfloat_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_16xbfloat_param_0]; ; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_16xbfloat_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_16xbfloat_param_1]; ; CHECK-NEXT: st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -106,12 +106,12 @@ define void @generic_8xi32(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xi32_param_0]; -; CHECK-NEXT: ld.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_8xi32_param_1]; -; CHECK-NEXT: st.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xi32_param_0]; +; CHECK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xi32_param_1]; +; CHECK-NEXT: 
st.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %a.load = load <8 x i32>, ptr %a store <8 x i32> %a.load, ptr %b @@ -124,12 +124,12 @@ define void @generic_4xi64(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xi64_param_0]; -; CHECK-NEXT: ld.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.v2.u64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd6, [generic_4xi64_param_1]; -; CHECK-NEXT: st.v2.u64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xi64_param_0]; +; CHECK-NEXT: ld.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [generic_4xi64_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <4 x i64>, ptr %a store <4 x i64> %a.load, ptr %b @@ -143,12 +143,12 @@ define void @generic_8xfloat(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_8xfloat_param_0]; -; CHECK-NEXT: ld.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: ld.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_8xfloat_param_1]; -; CHECK-NEXT: st.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; -; CHECK-NEXT: st.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_8xfloat_param_0]; +; CHECK-NEXT: ld.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_8xfloat_param_1]; +; CHECK-NEXT: st.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.v4.b32 [%rd2], {%f1, %f2, %f3, %f4}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr %a store <8 x float> %a.load, ptr %b @@ -162,12 +162,12 @@ define void @generic_4xdouble(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_4xdouble_param_0]; -; CHECK-NEXT: ld.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: ld.v2.f64 {%fd3, %fd4}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_4xdouble_param_1]; -; CHECK-NEXT: st.v2.f64 [%rd2+16], {%fd3, %fd4}; -; CHECK-NEXT: st.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_4xdouble_param_0]; +; CHECK-NEXT: ld.v2.b64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.v2.b64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_4xdouble_param_1]; +; CHECK-NEXT: st.v2.b64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.v2.b64 [%rd2], {%fd1, %fd2}; ; CHECK-NEXT: ret; %a.load = load <4 x double>, ptr %a store <4 x double> %a.load, ptr %b @@ -183,10 +183,10 @@ define void @generic_volatile_32xi8(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_32xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_32xi8_param_0]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_32xi8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_32xi8_param_1]; ; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -202,10 +202,10 @@ define void 
@generic_volatile_16xi16(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_16xi16_param_0]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_16xi16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_16xi16_param_1]; ; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -221,10 +221,10 @@ define void @generic_volatile_16xhalf(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xhalf_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_16xhalf_param_0]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_16xhalf_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_16xhalf_param_1]; ; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -240,10 +240,10 @@ define void @generic_volatile_16xbfloat(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_16xbfloat_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_16xbfloat_param_0]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_16xbfloat_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_16xbfloat_param_1]; ; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -259,12 +259,12 @@ define void @generic_volatile_8xi32(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.volatile.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_8xi32_param_1]; -; CHECK-NEXT: st.volatile.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.volatile.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xi32_param_0]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_8xi32_param_1]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x i32>, ptr %a store volatile <8 x i32> %a.load, ptr %b @@ -277,12 +277,12 @@ define void @generic_volatile_4xi64(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xi64_param_0]; -; CHECK-NEXT: ld.volatile.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.volatile.v2.u64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd6, [generic_volatile_4xi64_param_1]; -; CHECK-NEXT: 
st.volatile.v2.u64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.volatile.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xi64_param_0]; +; CHECK-NEXT: ld.volatile.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.volatile.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [generic_volatile_4xi64_param_1]; +; CHECK-NEXT: st.volatile.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.volatile.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i64>, ptr %a store volatile <4 x i64> %a.load, ptr %b @@ -296,12 +296,12 @@ define void @generic_volatile_8xfloat(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: ld.volatile.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; -; CHECK-NEXT: st.volatile.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_8xfloat_param_0]; +; CHECK-NEXT: ld.volatile.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.volatile.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.volatile.v4.b32 [%rd2], {%f1, %f2, %f3, %f4}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr %a store volatile <8 x float> %a.load, ptr %b @@ -315,12 +315,12 @@ define void @generic_volatile_4xdouble(ptr %a, ptr %b) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [generic_volatile_4xdouble_param_0]; -; CHECK-NEXT: ld.volatile.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: ld.volatile.v2.f64 {%fd3, %fd4}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [generic_volatile_4xdouble_param_1]; -; CHECK-NEXT: st.volatile.v2.f64 [%rd2+16], {%fd3, %fd4}; -; CHECK-NEXT: st.volatile.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ld.param.b64 %rd1, [generic_volatile_4xdouble_param_0]; +; CHECK-NEXT: ld.volatile.v2.b64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.volatile.v2.b64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [generic_volatile_4xdouble_param_1]; +; CHECK-NEXT: st.volatile.v2.b64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.volatile.v2.b64 [%rd2], {%fd1, %fd2}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x double>, ptr %a store volatile <4 x double> %a.load, ptr %b @@ -338,10 +338,10 @@ define void @global_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_32xi8_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [global_32xi8_param_0]; ; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_32xi8_param_1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_32xi8_param_1]; ; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; @@ -352,9 +352,9 @@ define void @global_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_32xi8_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, 
[global_32xi8_param_0]; ; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_32xi8_param_1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_32xi8_param_1]; ; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load <32 x i8>, ptr addrspace(1) %a @@ -369,10 +369,10 @@ define void @global_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_16xi16_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [global_16xi16_param_0]; ; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_16xi16_param_1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_16xi16_param_1]; ; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; @@ -383,9 +383,9 @@ define void @global_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_16xi16_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [global_16xi16_param_0]; ; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_16xi16_param_1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_16xi16_param_1]; ; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load <16 x i16>, ptr addrspace(1) %a @@ -400,10 +400,10 @@ define void @global_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_16xhalf_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [global_16xhalf_param_0]; ; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_16xhalf_param_1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_16xhalf_param_1]; ; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; @@ -414,9 +414,9 @@ define void @global_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_16xhalf_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [global_16xhalf_param_0]; ; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_16xhalf_param_1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_16xhalf_param_1]; ; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load <16 x half>, ptr addrspace(1) %a @@ -431,10 +431,10 @@ define void @global_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_16xbfloat_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [global_16xbfloat_param_0]; ; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_16xbfloat_param_1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_16xbfloat_param_1]; ; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; SM90-NEXT: 
st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; @@ -445,9 +445,9 @@ define void @global_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_16xbfloat_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [global_16xbfloat_param_0]; ; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_16xbfloat_param_1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_16xbfloat_param_1]; ; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load <16 x bfloat>, ptr addrspace(1) %a @@ -462,12 +462,12 @@ define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_8xi32_param_0]; -; SM90-NEXT: ld.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: ld.global.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_8xi32_param_1]; -; SM90-NEXT: st.global.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; SM90-NEXT: st.global.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_8xi32( @@ -476,10 +476,10 @@ define void @global_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_8xi32_param_0]; -; SM100-NEXT: ld.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_8xi32_param_1]; -; SM100-NEXT: st.global.v8.u32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_8xi32_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(1) %a store <8 x i32> %a.load, ptr addrspace(1) %b @@ -492,12 +492,12 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<7>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_4xi64_param_0]; -; SM90-NEXT: ld.global.v2.u64 {%rd2, %rd3}, [%rd1]; -; SM90-NEXT: ld.global.v2.u64 {%rd4, %rd5}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd6, [global_4xi64_param_1]; -; SM90-NEXT: st.global.v2.u64 [%rd6+16], {%rd4, %rd5}; -; SM90-NEXT: st.global.v2.u64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ld.param.b64 %rd1, [global_4xi64_param_0]; +; SM90-NEXT: ld.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd6, [global_4xi64_param_1]; +; SM90-NEXT: st.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.global.v2.b64 [%rd6], {%rd2, %rd3}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_4xi64( @@ -505,10 +505,10 @@ define void @global_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, 
[global_4xi64_param_0]; -; SM100-NEXT: ld.global.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd6, [global_4xi64_param_1]; -; SM100-NEXT: st.global.v4.u64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; SM100-NEXT: ld.param.b64 %rd1, [global_4xi64_param_0]; +; SM100-NEXT: ld.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_4xi64_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; SM100-NEXT: ret; %a.load = load <4 x i64>, ptr addrspace(1) %a store <4 x i64> %a.load, ptr addrspace(1) %b @@ -522,12 +522,12 @@ define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_8xfloat_param_0]; -; SM90-NEXT: ld.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; SM90-NEXT: ld.global.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_8xfloat_param_1]; -; SM90-NEXT: st.global.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; -; SM90-NEXT: st.global.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; SM90-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; +; SM90-NEXT: ld.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; SM90-NEXT: ld.global.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; +; SM90-NEXT: st.global.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; SM90-NEXT: st.global.v4.b32 [%rd2], {%f1, %f2, %f3, %f4}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_8xfloat( @@ -536,10 +536,10 @@ define void @global_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_8xfloat_param_0]; -; SM100-NEXT: ld.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_8xfloat_param_1]; -; SM100-NEXT: st.global.v8.f32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; SM100-NEXT: ld.param.b64 %rd1, [global_8xfloat_param_0]; +; SM100-NEXT: ld.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_8xfloat_param_1]; +; SM100-NEXT: st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; SM100-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(1) %a store <8 x float> %a.load, ptr addrspace(1) %b @@ -553,12 +553,12 @@ define void @global_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %fd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_4xdouble_param_0]; -; SM90-NEXT: ld.global.v2.f64 {%fd1, %fd2}, [%rd1]; -; SM90-NEXT: ld.global.v2.f64 {%fd3, %fd4}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_4xdouble_param_1]; -; SM90-NEXT: st.global.v2.f64 [%rd2+16], {%fd3, %fd4}; -; SM90-NEXT: st.global.v2.f64 [%rd2], {%fd1, %fd2}; +; SM90-NEXT: ld.param.b64 %rd1, [global_4xdouble_param_0]; +; SM90-NEXT: ld.global.v2.b64 {%fd1, %fd2}, [%rd1]; +; SM90-NEXT: ld.global.v2.b64 {%fd3, %fd4}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_4xdouble_param_1]; +; SM90-NEXT: st.global.v2.b64 [%rd2+16], {%fd3, %fd4}; +; SM90-NEXT: st.global.v2.b64 [%rd2], {%fd1, %fd2}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_4xdouble( @@ -567,10 +567,10 @@ define void @global_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %fd<5>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_4xdouble_param_0]; -; SM100-NEXT: ld.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; -; 
SM100-NEXT: ld.param.u64 %rd2, [global_4xdouble_param_1]; -; SM100-NEXT: st.global.v4.f64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; +; SM100-NEXT: ld.param.b64 %rd1, [global_4xdouble_param_0]; +; SM100-NEXT: ld.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_4xdouble_param_1]; +; SM100-NEXT: st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; ; SM100-NEXT: ret; %a.load = load <4 x double>, ptr addrspace(1) %a store <4 x double> %a.load, ptr addrspace(1) %b @@ -586,10 +586,10 @@ define void @global_volatile_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_32xi8_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_32xi8_param_0]; ; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_32xi8_param_1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_32xi8_param_1]; ; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; @@ -600,9 +600,9 @@ define void @global_volatile_32xi8(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_32xi8_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_32xi8_param_0]; ; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_32xi8_param_1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_32xi8_param_1]; ; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load volatile <32 x i8>, ptr addrspace(1) %a @@ -617,10 +617,10 @@ define void @global_volatile_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_16xi16_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_16xi16_param_0]; ; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_16xi16_param_1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_16xi16_param_1]; ; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; @@ -631,9 +631,9 @@ define void @global_volatile_16xi16(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_16xi16_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_16xi16_param_0]; ; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_16xi16_param_1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_16xi16_param_1]; ; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load volatile <16 x i16>, ptr addrspace(1) %a @@ -648,10 +648,10 @@ define void @global_volatile_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 
%rd1, [global_volatile_16xhalf_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_16xhalf_param_0]; ; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_16xhalf_param_1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_16xhalf_param_1]; ; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; @@ -662,9 +662,9 @@ define void @global_volatile_16xhalf(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_16xhalf_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_16xhalf_param_0]; ; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_16xhalf_param_1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_16xhalf_param_1]; ; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load volatile <16 x half>, ptr addrspace(1) %a @@ -679,10 +679,10 @@ define void @global_volatile_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_16xbfloat_param_0]; +; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_16xbfloat_param_0]; ; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_16xbfloat_param_1]; +; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_16xbfloat_param_1]; ; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; @@ -693,9 +693,9 @@ define void @global_volatile_16xbfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_16xbfloat_param_0]; +; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_16xbfloat_param_0]; ; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_16xbfloat_param_1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_16xbfloat_param_1]; ; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load volatile <16 x bfloat>, ptr addrspace(1) %a @@ -710,12 +710,12 @@ define void @global_volatile_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_8xi32_param_0]; -; SM90-NEXT: ld.volatile.global.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; SM90-NEXT: ld.volatile.global.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_8xi32_param_1]; -; SM90-NEXT: st.volatile.global.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; SM90-NEXT: st.volatile.global.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xi32_param_0]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, 
[global_volatile_8xi32_param_1]; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_volatile_8xi32( @@ -724,10 +724,10 @@ define void @global_volatile_8xi32(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_8xi32_param_0]; -; SM100-NEXT: ld.volatile.global.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_8xi32_param_1]; -; SM100-NEXT: st.volatile.global.v8.u32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; +; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xi32_param_0]; +; SM100-NEXT: ld.volatile.global.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_8xi32_param_1]; +; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; SM100-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(1) %a store volatile <8 x i32> %a.load, ptr addrspace(1) %b @@ -740,12 +740,12 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<7>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_4xi64_param_0]; -; SM90-NEXT: ld.volatile.global.v2.u64 {%rd2, %rd3}, [%rd1]; -; SM90-NEXT: ld.volatile.global.v2.u64 {%rd4, %rd5}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd6, [global_volatile_4xi64_param_1]; -; SM90-NEXT: st.volatile.global.v2.u64 [%rd6+16], {%rd4, %rd5}; -; SM90-NEXT: st.volatile.global.v2.u64 [%rd6], {%rd2, %rd3}; +; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_4xi64_param_0]; +; SM90-NEXT: ld.volatile.global.v2.b64 {%rd2, %rd3}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd6, [global_volatile_4xi64_param_1]; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd6+16], {%rd4, %rd5}; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd6], {%rd2, %rd3}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_volatile_4xi64( @@ -753,10 +753,10 @@ define void @global_volatile_4xi64(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<7>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_4xi64_param_0]; -; SM100-NEXT: ld.volatile.global.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd6, [global_volatile_4xi64_param_1]; -; SM100-NEXT: st.volatile.global.v4.u64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; +; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_4xi64_param_0]; +; SM100-NEXT: ld.volatile.global.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd6, [global_volatile_4xi64_param_1]; +; SM100-NEXT: st.volatile.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5}; ; SM100-NEXT: ret; %a.load = load volatile <4 x i64>, ptr addrspace(1) %a store volatile <4 x i64> %a.load, ptr addrspace(1) %b @@ -770,12 +770,12 @@ define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM90-NEXT: .reg .b64 %rd<3>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_8xfloat_param_0]; -; SM90-NEXT: ld.volatile.global.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; SM90-NEXT: ld.volatile.global.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_8xfloat_param_1]; -; SM90-NEXT: st.volatile.global.v4.f32 [%rd2+16], {%f5, %f6, %f7, 
%f8}; -; SM90-NEXT: st.volatile.global.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1]; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; SM90-NEXT: st.volatile.global.v4.b32 [%rd2], {%f1, %f2, %f3, %f4}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_volatile_8xfloat( @@ -784,10 +784,10 @@ define void @global_volatile_8xfloat(ptr addrspace(1) %a, ptr addrspace(1) %b) { ; SM100-NEXT: .reg .b64 %rd<3>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_8xfloat_param_0]; -; SM100-NEXT: ld.volatile.global.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_8xfloat_param_1]; -; SM100-NEXT: st.volatile.global.v8.f32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; +; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_8xfloat_param_0]; +; SM100-NEXT: ld.volatile.global.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_8xfloat_param_1]; +; SM100-NEXT: st.volatile.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; SM100-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(1) %a store volatile <8 x float> %a.load, ptr addrspace(1) %b @@ -801,12 +801,12 @@ define void @global_volatile_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) ; SM90-NEXT: .reg .b64 %fd<5>; ; SM90-EMPTY: ; SM90-NEXT: // %bb.0: -; SM90-NEXT: ld.param.u64 %rd1, [global_volatile_4xdouble_param_0]; -; SM90-NEXT: ld.volatile.global.v2.f64 {%fd1, %fd2}, [%rd1]; -; SM90-NEXT: ld.volatile.global.v2.f64 {%fd3, %fd4}, [%rd1+16]; -; SM90-NEXT: ld.param.u64 %rd2, [global_volatile_4xdouble_param_1]; -; SM90-NEXT: st.volatile.global.v2.f64 [%rd2+16], {%fd3, %fd4}; -; SM90-NEXT: st.volatile.global.v2.f64 [%rd2], {%fd1, %fd2}; +; SM90-NEXT: ld.param.b64 %rd1, [global_volatile_4xdouble_param_0]; +; SM90-NEXT: ld.volatile.global.v2.b64 {%fd1, %fd2}, [%rd1]; +; SM90-NEXT: ld.volatile.global.v2.b64 {%fd3, %fd4}, [%rd1+16]; +; SM90-NEXT: ld.param.b64 %rd2, [global_volatile_4xdouble_param_1]; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd2+16], {%fd3, %fd4}; +; SM90-NEXT: st.volatile.global.v2.b64 [%rd2], {%fd1, %fd2}; ; SM90-NEXT: ret; ; ; SM100-LABEL: global_volatile_4xdouble( @@ -815,10 +815,10 @@ define void @global_volatile_4xdouble(ptr addrspace(1) %a, ptr addrspace(1) %b) ; SM100-NEXT: .reg .b64 %fd<5>; ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: -; SM100-NEXT: ld.param.u64 %rd1, [global_volatile_4xdouble_param_0]; -; SM100-NEXT: ld.volatile.global.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; -; SM100-NEXT: ld.param.u64 %rd2, [global_volatile_4xdouble_param_1]; -; SM100-NEXT: st.volatile.global.v4.f64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; +; SM100-NEXT: ld.param.b64 %rd1, [global_volatile_4xdouble_param_0]; +; SM100-NEXT: ld.volatile.global.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; SM100-NEXT: ld.param.b64 %rd2, [global_volatile_4xdouble_param_1]; +; SM100-NEXT: st.volatile.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4}; ; SM100-NEXT: ret; %a.load = load volatile <4 x double>, ptr addrspace(1) %a store volatile <4 x double> %a.load, ptr addrspace(1) %b @@ -836,10 +836,10 @@ define void @shared_32xi8(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; 
CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_32xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_32xi8_param_0]; ; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_32xi8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_32xi8_param_1]; ; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -855,10 +855,10 @@ define void @shared_16xi16(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_16xi16_param_0]; ; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_16xi16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_16xi16_param_1]; ; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -874,10 +874,10 @@ define void @shared_16xhalf(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xhalf_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_16xhalf_param_0]; ; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_16xhalf_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_16xhalf_param_1]; ; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -893,10 +893,10 @@ define void @shared_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_16xbfloat_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_16xbfloat_param_0]; ; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_16xbfloat_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_16xbfloat_param_1]; ; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -912,12 +912,12 @@ define void @shared_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xi32_param_0]; -; CHECK-NEXT: ld.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.shared.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_8xi32_param_1]; -; CHECK-NEXT: st.shared.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.shared.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xi32_param_0]; +; CHECK-NEXT: ld.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_8xi32_param_1]; +; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(3) %a store <8 x i32> %a.load, ptr 
addrspace(3) %b @@ -930,12 +930,12 @@ define void @shared_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xi64_param_0]; -; CHECK-NEXT: ld.shared.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.shared.v2.u64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd6, [shared_4xi64_param_1]; -; CHECK-NEXT: st.shared.v2.u64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.shared.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xi64_param_0]; +; CHECK-NEXT: ld.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [shared_4xi64_param_1]; +; CHECK-NEXT: st.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.shared.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <4 x i64>, ptr addrspace(3) %a store <4 x i64> %a.load, ptr addrspace(3) %b @@ -949,12 +949,12 @@ define void @shared_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_8xfloat_param_0]; -; CHECK-NEXT: ld.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: ld.shared.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_8xfloat_param_1]; -; CHECK-NEXT: st.shared.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; -; CHECK-NEXT: st.shared.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_8xfloat_param_0]; +; CHECK-NEXT: ld.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.shared.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_8xfloat_param_1]; +; CHECK-NEXT: st.shared.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.shared.v4.b32 [%rd2], {%f1, %f2, %f3, %f4}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(3) %a store <8 x float> %a.load, ptr addrspace(3) %b @@ -968,12 +968,12 @@ define void @shared_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_4xdouble_param_0]; -; CHECK-NEXT: ld.shared.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: ld.shared.v2.f64 {%fd3, %fd4}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_4xdouble_param_1]; -; CHECK-NEXT: st.shared.v2.f64 [%rd2+16], {%fd3, %fd4}; -; CHECK-NEXT: st.shared.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_4xdouble_param_0]; +; CHECK-NEXT: ld.shared.v2.b64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.shared.v2.b64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_4xdouble_param_1]; +; CHECK-NEXT: st.shared.v2.b64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.shared.v2.b64 [%rd2], {%fd1, %fd2}; ; CHECK-NEXT: ret; %a.load = load <4 x double>, ptr addrspace(3) %a store <4 x double> %a.load, ptr addrspace(3) %b @@ -989,10 +989,10 @@ define void @shared_volatile_32xi8(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_32xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_32xi8_param_0]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_32xi8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_32xi8_param_1]; ; 
CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1008,10 +1008,10 @@ define void @shared_volatile_16xi16(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_16xi16_param_0]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_16xi16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_16xi16_param_1]; ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1027,10 +1027,10 @@ define void @shared_volatile_16xhalf(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xhalf_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_16xhalf_param_0]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_16xhalf_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_16xhalf_param_1]; ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1046,10 +1046,10 @@ define void @shared_volatile_16xbfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_16xbfloat_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_16xbfloat_param_0]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_16xbfloat_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_16xbfloat_param_1]; ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1065,12 +1065,12 @@ define void @shared_volatile_8xi32(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_8xi32_param_1]; -; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.volatile.shared.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xi32_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_8xi32_param_1]; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(3) %a store volatile <8 x i32> 
%a.load, ptr addrspace(3) %b @@ -1083,12 +1083,12 @@ define void @shared_volatile_4xi64(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xi64_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v2.u64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd6, [shared_volatile_4xi64_param_1]; -; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.volatile.shared.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xi64_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [shared_volatile_4xi64_param_1]; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i64>, ptr addrspace(3) %a store volatile <4 x i64> %a.load, ptr addrspace(3) %b @@ -1102,12 +1102,12 @@ define void @shared_volatile_8xfloat(ptr addrspace(3) %a, ptr addrspace(3) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; -; CHECK-NEXT: st.volatile.shared.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_8xfloat_param_0]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.volatile.shared.v4.b32 [%rd2], {%f1, %f2, %f3, %f4}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(3) %a store volatile <8 x float> %a.load, ptr addrspace(3) %b @@ -1121,12 +1121,12 @@ define void @shared_volatile_4xdouble(ptr addrspace(3) %a, ptr addrspace(3) %b) ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [shared_volatile_4xdouble_param_0]; -; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: ld.volatile.shared.v2.f64 {%fd3, %fd4}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [shared_volatile_4xdouble_param_1]; -; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd2+16], {%fd3, %fd4}; -; CHECK-NEXT: st.volatile.shared.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ld.param.b64 %rd1, [shared_volatile_4xdouble_param_0]; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.volatile.shared.v2.b64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [shared_volatile_4xdouble_param_1]; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.volatile.shared.v2.b64 [%rd2], {%fd1, %fd2}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x double>, ptr addrspace(3) %a store volatile <4 x double> %a.load, ptr addrspace(3) %b @@ -1144,10 +1144,10 @@ define void @local_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; 
CHECK-NEXT: ld.param.u64 %rd1, [local_32xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_32xi8_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_32xi8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_32xi8_param_1]; ; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1163,10 +1163,10 @@ define void @local_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_16xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_16xi16_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_16xi16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_16xi16_param_1]; ; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1182,10 +1182,10 @@ define void @local_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_16xhalf_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_16xhalf_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_16xhalf_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_16xhalf_param_1]; ; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1201,10 +1201,10 @@ define void @local_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_16xbfloat_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_16xbfloat_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_16xbfloat_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_16xbfloat_param_1]; ; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1220,12 +1220,12 @@ define void @local_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_8xi32_param_0]; -; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_8xi32_param_1]; -; CHECK-NEXT: st.local.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.param.b64 %rd1, [local_8xi32_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_8xi32_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %a.load = load <8 x i32>, ptr addrspace(5) %a store <8 x i32> %a.load, ptr addrspace(5) %b @@ -1238,12 +1238,12 @@ define void @local_4xi64(ptr 
addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xi64_param_0]; -; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.local.v2.u64 {%rd4, %rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd6, [local_4xi64_param_1]; -; CHECK-NEXT: st.local.v2.u64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.local.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.b64 %rd1, [local_4xi64_param_0]; +; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [local_4xi64_param_1]; +; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load <4 x i64>, ptr addrspace(5) %a store <4 x i64> %a.load, ptr addrspace(5) %b @@ -1257,12 +1257,12 @@ define void @local_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_8xfloat_param_1]; -; CHECK-NEXT: st.local.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; -; CHECK-NEXT: st.local.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ld.param.b64 %rd1, [local_8xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_8xfloat_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%f1, %f2, %f3, %f4}; ; CHECK-NEXT: ret; %a.load = load <8 x float>, ptr addrspace(5) %a store <8 x float> %a.load, ptr addrspace(5) %b @@ -1276,12 +1276,12 @@ define void @local_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_4xdouble_param_0]; -; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: ld.local.v2.f64 {%fd3, %fd4}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_4xdouble_param_1]; -; CHECK-NEXT: st.local.v2.f64 [%rd2+16], {%fd3, %fd4}; -; CHECK-NEXT: st.local.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ld.param.b64 %rd1, [local_4xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.b64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.local.v2.b64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_4xdouble_param_1]; +; CHECK-NEXT: st.local.v2.b64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.local.v2.b64 [%rd2], {%fd1, %fd2}; ; CHECK-NEXT: ret; %a.load = load <4 x double>, ptr addrspace(5) %a store <4 x double> %a.load, ptr addrspace(5) %b @@ -1297,10 +1297,10 @@ define void @local_volatile_32xi8(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_32xi8_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_32xi8_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_32xi8_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_32xi8_param_1]; ; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; 
CHECK-NEXT: ret; @@ -1316,10 +1316,10 @@ define void @local_volatile_16xi16(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xi16_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xi16_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_16xi16_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xi16_param_1]; ; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1335,10 +1335,10 @@ define void @local_volatile_16xhalf(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xhalf_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xhalf_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_16xhalf_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xhalf_param_1]; ; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1354,10 +1354,10 @@ define void @local_volatile_16xbfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_16xbfloat_param_0]; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_16xbfloat_param_0]; ; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; ; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_16xbfloat_param_1]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_16xbfloat_param_1]; ; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; ; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; @@ -1373,12 +1373,12 @@ define void @local_volatile_8xi32(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xi32_param_0]; -; CHECK-NEXT: ld.local.v4.u32 {%r1, %r2, %r3, %r4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.u32 {%r5, %r6, %r7, %r8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_8xi32_param_1]; -; CHECK-NEXT: st.local.v4.u32 [%rd2+16], {%r5, %r6, %r7, %r8}; -; CHECK-NEXT: st.local.v4.u32 [%rd2], {%r1, %r2, %r3, %r4}; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xi32_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%r5, %r6, %r7, %r8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_8xi32_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%r5, %r6, %r7, %r8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%r1, %r2, %r3, %r4}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x i32>, ptr addrspace(5) %a store volatile <8 x i32> %a.load, ptr addrspace(5) %b @@ -1391,12 +1391,12 @@ define void @local_volatile_4xi64(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<7>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xi64_param_0]; -; CHECK-NEXT: ld.local.v2.u64 {%rd2, %rd3}, [%rd1]; -; CHECK-NEXT: ld.local.v2.u64 {%rd4, 
%rd5}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd6, [local_volatile_4xi64_param_1]; -; CHECK-NEXT: st.local.v2.u64 [%rd6+16], {%rd4, %rd5}; -; CHECK-NEXT: st.local.v2.u64 [%rd6], {%rd2, %rd3}; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xi64_param_0]; +; CHECK-NEXT: ld.local.v2.b64 {%rd2, %rd3}, [%rd1]; +; CHECK-NEXT: ld.local.v2.b64 {%rd4, %rd5}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd6, [local_volatile_4xi64_param_1]; +; CHECK-NEXT: st.local.v2.b64 [%rd6+16], {%rd4, %rd5}; +; CHECK-NEXT: st.local.v2.b64 [%rd6], {%rd2, %rd3}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x i64>, ptr addrspace(5) %a store volatile <4 x i64> %a.load, ptr addrspace(5) %b @@ -1410,12 +1410,12 @@ define void @local_volatile_8xfloat(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %rd<3>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_8xfloat_param_0]; -; CHECK-NEXT: ld.local.v4.f32 {%f1, %f2, %f3, %f4}, [%rd1]; -; CHECK-NEXT: ld.local.v4.f32 {%f5, %f6, %f7, %f8}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_8xfloat_param_1]; -; CHECK-NEXT: st.local.v4.f32 [%rd2+16], {%f5, %f6, %f7, %f8}; -; CHECK-NEXT: st.local.v4.f32 [%rd2], {%f1, %f2, %f3, %f4}; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_8xfloat_param_0]; +; CHECK-NEXT: ld.local.v4.b32 {%f1, %f2, %f3, %f4}, [%rd1]; +; CHECK-NEXT: ld.local.v4.b32 {%f5, %f6, %f7, %f8}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_8xfloat_param_1]; +; CHECK-NEXT: st.local.v4.b32 [%rd2+16], {%f5, %f6, %f7, %f8}; +; CHECK-NEXT: st.local.v4.b32 [%rd2], {%f1, %f2, %f3, %f4}; ; CHECK-NEXT: ret; %a.load = load volatile <8 x float>, ptr addrspace(5) %a store volatile <8 x float> %a.load, ptr addrspace(5) %b @@ -1429,12 +1429,12 @@ define void @local_volatile_4xdouble(ptr addrspace(5) %a, ptr addrspace(5) %b) { ; CHECK-NEXT: .reg .b64 %fd<5>; ; CHECK-EMPTY: ; CHECK-NEXT: // %bb.0: -; CHECK-NEXT: ld.param.u64 %rd1, [local_volatile_4xdouble_param_0]; -; CHECK-NEXT: ld.local.v2.f64 {%fd1, %fd2}, [%rd1]; -; CHECK-NEXT: ld.local.v2.f64 {%fd3, %fd4}, [%rd1+16]; -; CHECK-NEXT: ld.param.u64 %rd2, [local_volatile_4xdouble_param_1]; -; CHECK-NEXT: st.local.v2.f64 [%rd2+16], {%fd3, %fd4}; -; CHECK-NEXT: st.local.v2.f64 [%rd2], {%fd1, %fd2}; +; CHECK-NEXT: ld.param.b64 %rd1, [local_volatile_4xdouble_param_0]; +; CHECK-NEXT: ld.local.v2.b64 {%fd1, %fd2}, [%rd1]; +; CHECK-NEXT: ld.local.v2.b64 {%fd3, %fd4}, [%rd1+16]; +; CHECK-NEXT: ld.param.b64 %rd2, [local_volatile_4xdouble_param_1]; +; CHECK-NEXT: st.local.v2.b64 [%rd2+16], {%fd3, %fd4}; +; CHECK-NEXT: st.local.v2.b64 [%rd2], {%fd1, %fd2}; ; CHECK-NEXT: ret; %a.load = load volatile <4 x double>, ptr addrspace(5) %a store volatile <4 x double> %a.load, ptr addrspace(5) %b From adb299a9667bc942bd92187f909ef0cce3e871f1 Mon Sep 17 00:00:00 2001 From: Drew Kersnar Date: Mon, 12 May 2025 19:34:11 +0000 Subject: [PATCH 10/11] Update intirnsics to use untyped ld/st --- llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 8 +-- llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll | 16 ++--- .../load-store-256-addressing-invariant.ll | 64 +++++++++---------- 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index eb565e6219d69..8110ba1b2b37b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2419,10 +2419,10 @@ def INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"b16", Int16Regs>; def INT_PTX_LDG_G_v4i32_ELE : 
VLDG_G_ELE_V4<"b32", Int32Regs>; def INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"b32", Float32Regs>; -def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"u64", Int64Regs>; -def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"f64", Float64Regs>; -def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"u32", Int32Regs>; -def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"f32", Float32Regs>; +def INT_PTX_LDG_G_v4i64_ELE : VLDG_G_ELE_V4<"b64", Int64Regs>; +def INT_PTX_LDG_G_v4f64_ELE : VLDG_G_ELE_V4<"b64", Float64Regs>; +def INT_PTX_LDG_G_v8i32_ELE : VLDG_G_ELE_V8<"b32", Int32Regs>; +def INT_PTX_LDG_G_v8f32_ELE : VLDG_G_ELE_V8<"b32", Float32Regs>; multiclass NG_TO_G Preds = []> { if Supports32 then diff --git a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll index fbb5c9ab5cf49..6fc698011dd42 100644 --- a/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll +++ b/llvm/test/CodeGen/NVPTX/ldg-invariant-256.ll @@ -54,7 +54,7 @@ define i8 @ld_global_v32i8(ptr addrspace(1) %ptr) { ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v32i8_param_0]; -; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: bfe.u32 %r9, %r8, 0, 8; ; SM100-NEXT: cvt.u16.u32 %rs1, %r9; ; SM100-NEXT: bfe.u32 %r10, %r7, 0, 8; @@ -139,7 +139,7 @@ define i16 @ld_global_v16i16(ptr addrspace(1) %ptr) { ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16i16_param_0]; -; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: mov.b32 {%rs1, _}, %r8; ; SM100-NEXT: mov.b32 {%rs2, _}, %r7; ; SM100-NEXT: mov.b32 {%rs3, _}, %r6; @@ -214,7 +214,7 @@ define half @ld_global_v16f16(ptr addrspace(1) %ptr) { ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16f16_param_0]; -; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: mov.b32 {%rs1, _}, %r8; ; SM100-NEXT: mov.b32 {%rs2, _}, %r7; ; SM100-NEXT: mov.b32 {%rs3, _}, %r6; @@ -288,7 +288,7 @@ define bfloat @ld_global_v16bf16(ptr addrspace(1) %ptr) { ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v16bf16_param_0]; -; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: mov.b32 {%rs1, _}, %r8; ; SM100-NEXT: mov.b32 {%rs2, _}, %r7; ; SM100-NEXT: mov.b32 {%rs3, _}, %r6; @@ -352,7 +352,7 @@ define i32 @ld_global_v8i32(ptr addrspace(1) %ptr) { ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8i32_param_0]; -; SM100-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; +; SM100-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1]; ; SM100-NEXT: add.s32 %r9, %r1, %r2; ; SM100-NEXT: add.s32 %r10, %r3, %r4; ; SM100-NEXT: add.s32 %r11, %r5, %r6; @@ -409,7 +409,7 @@ define float @ld_global_v8f32(ptr addrspace(1) %ptr) { ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v8f32_param_0]; -; SM100-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; +; SM100-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1]; ; 
SM100-NEXT: add.rn.f32 %f9, %f1, %f2; ; SM100-NEXT: add.rn.f32 %f10, %f3, %f4; ; SM100-NEXT: add.rn.f32 %f11, %f5, %f6; @@ -460,7 +460,7 @@ define i64 @ld_global_v4i64(ptr addrspace(1) %ptr) { ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4i64_param_0]; -; SM100-NEXT: ld.global.nc.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; +; SM100-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1]; ; SM100-NEXT: add.s64 %rd6, %rd2, %rd3; ; SM100-NEXT: add.s64 %rd7, %rd4, %rd5; ; SM100-NEXT: add.s64 %rd8, %rd6, %rd7; @@ -500,7 +500,7 @@ define double @ld_global_v4f64(ptr addrspace(1) %ptr) { ; SM100-EMPTY: ; SM100-NEXT: // %bb.0: ; SM100-NEXT: ld.param.b64 %rd1, [ld_global_v4f64_param_0]; -; SM100-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; +; SM100-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1]; ; SM100-NEXT: add.rn.f64 %fd5, %fd1, %fd2; ; SM100-NEXT: add.rn.f64 %fd6, %fd3, %fd4; ; SM100-NEXT: add.rn.f64 %fd7, %fd5, %fd6; diff --git a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll index eed70d8b42c48..5d974cef0d475 100644 --- a/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll +++ b/llvm/test/CodeGen/NVPTX/load-store-256-addressing-invariant.ll @@ -29,7 +29,7 @@ define void @avar_i8() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <32 x i8>, ptr addrspace(1) @globalin, !invariant.load !0 @@ -43,7 +43,7 @@ define void @avar_i16() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x i16>, ptr addrspace(1) @globalin, !invariant.load !0 @@ -57,7 +57,7 @@ define void @avar_half() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x half>, ptr addrspace(1) @globalin, !invariant.load !0 @@ -71,7 +71,7 @@ define void @avar_bfloat() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <16 x bfloat>, ptr addrspace(1) @globalin, !invariant.load !0 @@ -85,7 +85,7 @@ define void @avar_i32() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin]; ; PTX-NEXT: st.global.v8.b32 [globalout], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %load = load <8 x i32>, ptr 
addrspace(1) @globalin, !invariant.load !0 @@ -99,7 +99,7 @@ define void @avar_i64() { ; PTX-NEXT: .reg .b64 %rd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; +; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin]; ; PTX-NEXT: st.global.v4.b64 [globalout], {%rd1, %rd2, %rd3, %rd4}; ; PTX-NEXT: ret; %load = load <4 x i64>, ptr addrspace(1) @globalin, !invariant.load !0 @@ -113,7 +113,7 @@ define void @avar_float() { ; PTX-NEXT: .reg .b32 %f<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin]; +; PTX-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin]; ; PTX-NEXT: st.global.v8.b32 [globalout], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}; ; PTX-NEXT: ret; %load = load <8 x float>, ptr addrspace(1) @globalin, !invariant.load !0 @@ -127,7 +127,7 @@ define void @avar_double() { ; PTX-NEXT: .reg .b64 %fd<5>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin]; +; PTX-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin]; ; PTX-NEXT: st.global.v4.b64 [globalout], {%fd1, %fd2, %fd3, %fd4}; ; PTX-NEXT: ret; %load = load <4 x double>, ptr addrspace(1) @globalin, !invariant.load !0 @@ -141,7 +141,7 @@ define void @asi_i8() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 @@ -157,7 +157,7 @@ define void @asi_i16() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 @@ -173,7 +173,7 @@ define void @asi_half() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 @@ -189,7 +189,7 @@ define void @asi_bfloat() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}; ; PTX-NEXT: ret; %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32 @@ -205,7 +205,7 @@ define void @asi_i32() { ; PTX-NEXT: .reg .b32 %r<9>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; +; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [globalin+32]; ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%r1, %r2, %r3, %r4, 
%r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
@@ -221,7 +221,7 @@ define void @asi_i64() {
 ; PTX-NEXT: .reg .b64 %rd<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.global.nc.v4.u64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
+; PTX-NEXT: ld.global.nc.v4.b64 {%rd1, %rd2, %rd3, %rd4}, [globalin+32];
 ; PTX-NEXT: st.global.v4.b64 [globalout+32], {%rd1, %rd2, %rd3, %rd4};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
@@ -237,7 +237,7 @@ define void @asi_float() {
 ; PTX-NEXT: .reg .b32 %f<9>;
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32];
+; PTX-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [globalin+32];
 ; PTX-NEXT: st.global.v8.b32 [globalout+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
@@ -253,7 +253,7 @@ define void @asi_double() {
 ; PTX-NEXT: .reg .b64 %fd<5>;
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
-; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32];
+; PTX-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [globalin+32];
 ; PTX-NEXT: st.global.v4.b64 [globalout+32], {%fd1, %fd2, %fd3, %fd4};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) @globalin, i32 32
@@ -271,7 +271,7 @@ define void @areg_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i8_param_0];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i8_param_1];
 ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
@@ -287,7 +287,7 @@ define void @areg_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i16_param_0];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i16_param_1];
 ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
@@ -303,7 +303,7 @@ define void @areg_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_half_param_0];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_half_param_1];
 ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
@@ -319,7 +319,7 @@ define void @areg_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_bfloat_param_0];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_bfloat_param_1];
 ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
@@ -336,7 +336,7 @@ define void @areg_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i32_param_0];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1];
 ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_i32_param_1];
 ; PTX-NEXT: st.global.v8.b32 [%rd2], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
@@ -352,7 +352,7 @@ define void @areg_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_i64_param_0];
-; PTX-NEXT: ld.global.nc.v4.u64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
+; PTX-NEXT: ld.global.nc.v4.b64 {%rd2, %rd3, %rd4, %rd5}, [%rd1];
 ; PTX-NEXT: ld.param.b64 %rd6, [areg_64_i64_param_1];
 ; PTX-NEXT: st.global.v4.b64 [%rd6], {%rd2, %rd3, %rd4, %rd5};
 ; PTX-NEXT: ret;
@@ -369,7 +369,7 @@ define void @areg_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_float_param_0];
-; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
+; PTX-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1];
 ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_float_param_1];
 ; PTX-NEXT: st.global.v8.b32 [%rd2], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
 ; PTX-NEXT: ret;
@@ -386,7 +386,7 @@ define void @areg_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-EMPTY:
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [areg_64_double_param_0];
-; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
+; PTX-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1];
 ; PTX-NEXT: ld.param.b64 %rd2, [areg_64_double_param_1];
 ; PTX-NEXT: st.global.v4.b64 [%rd2], {%fd1, %fd2, %fd3, %fd4};
 ; PTX-NEXT: ret;
@@ -404,7 +404,7 @@ define void @ari_64_i8(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i8_param_0];
 ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i8_param_1];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
 ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
@@ -423,7 +423,7 @@ define void @ari_64_i16(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i16_param_0];
 ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i16_param_1];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
 ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
@@ -442,7 +442,7 @@ define void @ari_64_half(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_half_param_0];
 ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_half_param_1];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
 ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
@@ -461,7 +461,7 @@ define void @ari_64_bfloat(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_bfloat_param_0];
 ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_bfloat_param_1];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
 ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
@@ -480,7 +480,7 @@ define void @ari_64_i32(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i32_param_0];
 ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i32_param_1];
-; PTX-NEXT: ld.global.nc.v8.u32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
+; PTX-NEXT: ld.global.nc.v8.b32 {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8}, [%rd1+32];
 ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%r1, %r2, %r3, %r4, %r5, %r6, %r7, %r8};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
@@ -498,7 +498,7 @@ define void @ari_64_i64(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_i64_param_0];
 ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_i64_param_1];
-; PTX-NEXT: ld.global.nc.v4.u64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
+; PTX-NEXT: ld.global.nc.v4.b64 {%rd3, %rd4, %rd5, %rd6}, [%rd1+32];
 ; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%rd3, %rd4, %rd5, %rd6};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
@@ -517,7 +517,7 @@ define void @ari_64_float(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_float_param_0];
 ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_float_param_1];
-; PTX-NEXT: ld.global.nc.v8.f32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32];
+; PTX-NEXT: ld.global.nc.v8.b32 {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8}, [%rd1+32];
 ; PTX-NEXT: st.global.v8.b32 [%rd2+32], {%f1, %f2, %f3, %f4, %f5, %f6, %f7, %f8};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32
@@ -536,7 +536,7 @@ define void @ari_64_double(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; PTX-NEXT: // %bb.0:
 ; PTX-NEXT: ld.param.b64 %rd1, [ari_64_double_param_0];
 ; PTX-NEXT: ld.param.b64 %rd2, [ari_64_double_param_1];
-; PTX-NEXT: ld.global.nc.v4.f64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32];
+; PTX-NEXT: ld.global.nc.v4.b64 {%fd1, %fd2, %fd3, %fd4}, [%rd1+32];
 ; PTX-NEXT: st.global.v4.b64 [%rd2+32], {%fd1, %fd2, %fd3, %fd4};
 ; PTX-NEXT: ret;
 %in.offset = getelementptr inbounds i8, ptr addrspace(1) %in, i32 32

From 65f3e735600750c6f364e3b9e7ddb24fcc9b4076 Mon Sep 17 00:00:00 2001
From: Drew Kersnar
Date: Tue, 13 May 2025 14:22:34 +0000
Subject: [PATCH 11/11] Simplify op append

---
 llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 71d1da781ab8e..7d171cff7bcb4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1460,10 +1460,8 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
   const unsigned TotalWidth = StoreVT.getSimpleVT().getSizeInBits();
   unsigned NumElts = getLoadStoreVectorNumElts(N);
 
-  SmallVector<SDValue> Ops;
-  // Append the operands from 1 to NumElts, inclusive
-  const SDUse *FirstStoredVal = N->ops().begin() + 1;
-  Ops.append(FirstStoredVal, FirstStoredVal + NumElts);
+
+  SmallVector<SDValue> Ops(N->ops().slice(1, NumElts));
 
   SDValue N2 = N->getOperand(NumElts + 1);
   unsigned ToTypeWidth = TotalWidth / NumElts;
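
For readers unfamiliar with the idiom PATCH 11/11 switches to: the sketch below is a standalone illustration of why constructing Ops from N->ops().slice(1, NumElts) is equivalent to the removed begin()+1/append() sequence. It uses simplified stand-in types (OpRange, SDValue, SDNode here are assumptions for the sketch, not the real llvm:: classes), so it only demonstrates the slicing idea, not the actual DAG machinery.

// Standalone sketch (stand-in types, not the real LLVM classes) of the
// operand-gathering idiom adopted by "Simplify op append".
#include <cassert>
#include <cstddef>
#include <vector>

struct SDValue { int Id = 0; };              // stand-in for llvm::SDValue

// Minimal ArrayRef-like view over the operand array, enough for slice().
struct OpRange {
  const SDValue *Data = nullptr;
  std::size_t Size = 0;
  OpRange slice(std::size_t Start, std::size_t N) const {
    assert(Start + N <= Size && "slice out of range");
    return {Data + Start, N};
  }
  const SDValue *begin() const { return Data; }
  const SDValue *end() const { return Data + Size; }
};

struct SDNode {                              // stand-in for llvm::SDNode
  std::vector<SDValue> Operands;             // operand 0 is the chain
  OpRange ops() const { return {Operands.data(), Operands.size()}; }
};

int main() {
  SDNode N;
  N.Operands = {{0}, {1}, {2}, {3}, {4}, {5}};  // chain, 4 stored values, address
  const std::size_t NumElts = 4;

  // Old form: manual pointer arithmetic followed by an append.
  std::vector<SDValue> OpsOld;
  const SDValue *FirstStoredVal = N.ops().begin() + 1;
  OpsOld.insert(OpsOld.end(), FirstStoredVal, FirstStoredVal + NumElts);

  // New form: construct the operand list directly from the sliced range.
  OpRange Stored = N.ops().slice(1, NumElts);
  std::vector<SDValue> OpsNew(Stored.begin(), Stored.end());

  assert(OpsOld.size() == OpsNew.size());
  assert(OpsNew.front().Id == 1 && OpsNew.back().Id == 4);
  return 0;
}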