Conversation

@AlexMaclean

No description provided.

@AlexMaclean requested a review from Artem-B February 22, 2025 02:05
@AlexMaclean self-assigned this Feb 22, 2025
@llvmbot commented Feb 22, 2025

@llvm/pr-subscribers-backend-nvptx

Author: Alex MacLean (AlexMaclean)

Changes

Patch is 23.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128270.diff

3 Files Affected:

  • (modified) llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp (+38-48)
  • (modified) llvm/lib/Target/NVPTX/NVPTXIntrinsics.td (+66-103)
  • (modified) llvm/test/CodeGen/NVPTX/variadics-backend.ll (+21-23)
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6f0bf510ad893..971a128aadfdb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1106,9 +1106,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
                           std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
       break;
     }
-    if (!Opcode)
-      return false;
-    Ops.append({Base, Offset, Chain});
   } else {
     if (PointerSize == 64) {
       SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1148,10 +1145,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
         break;
       }
     }
-    if (!Opcode)
-      return false;
-    Ops.append({Base, Offset, Chain});
   }
+  if (!Opcode)
+    return false;
+  Ops.append({Base, Offset, Chain});
   LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
 
   MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
@@ -1202,63 +1199,59 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
   std::optional<unsigned> Opcode;
   SDLoc DL(N);
   SDNode *LD;
-  SDValue Base, Offset, Addr;
+  SDValue Base, Offset;
 
-  if (SelectDirectAddr(Op1, Addr)) {
+  if (SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
     switch (N->getOpcode()) {
     default:
       return false;
     case ISD::LOAD:
       Opcode = pickOpcodeForVT(
-          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
-          NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
-          NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
-          NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8asi,
+          NVPTX::INT_PTX_LDG_GLOBAL_i16asi, NVPTX::INT_PTX_LDG_GLOBAL_i32asi,
+          NVPTX::INT_PTX_LDG_GLOBAL_i64asi, NVPTX::INT_PTX_LDG_GLOBAL_f32asi,
+          NVPTX::INT_PTX_LDG_GLOBAL_f64asi);
       break;
     case ISD::INTRINSIC_W_CHAIN:
       Opcode = pickOpcodeForVT(
-          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
-          NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
-          NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
-          NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
+          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8asi,
+          NVPTX::INT_PTX_LDU_GLOBAL_i16asi, NVPTX::INT_PTX_LDU_GLOBAL_i32asi,
+          NVPTX::INT_PTX_LDU_GLOBAL_i64asi, NVPTX::INT_PTX_LDU_GLOBAL_f32asi,
+          NVPTX::INT_PTX_LDU_GLOBAL_f64asi);
       break;
     case NVPTXISD::LoadV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
-                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
-                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
-                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
-                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
-                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
+                               NVPTX::INT_PTX_LDG_G_v2i8_ELE_asi,
+                               NVPTX::INT_PTX_LDG_G_v2i16_ELE_asi,
+                               NVPTX::INT_PTX_LDG_G_v2i32_ELE_asi,
+                               NVPTX::INT_PTX_LDG_G_v2i64_ELE_asi,
+                               NVPTX::INT_PTX_LDG_G_v2f32_ELE_asi,
+                               NVPTX::INT_PTX_LDG_G_v2f64_ELE_asi);
       break;
     case NVPTXISD::LDUV2:
       Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
-                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
-                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
-                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
-                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
-                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
-                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
+                               NVPTX::INT_PTX_LDU_G_v2i8_ELE_asi,
+                               NVPTX::INT_PTX_LDU_G_v2i16_ELE_asi,
+                               NVPTX::INT_PTX_LDU_G_v2i32_ELE_asi,
+                               NVPTX::INT_PTX_LDU_G_v2i64_ELE_asi,
+                               NVPTX::INT_PTX_LDU_G_v2f32_ELE_asi,
+                               NVPTX::INT_PTX_LDU_G_v2f64_ELE_asi);
       break;
     case NVPTXISD::LoadV4:
       Opcode = pickOpcodeForVT(
-          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
-          NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
-          NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
-          NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
+          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_asi,
+          NVPTX::INT_PTX_LDG_G_v4i16_ELE_asi,
+          NVPTX::INT_PTX_LDG_G_v4i32_ELE_asi, std::nullopt,
+          NVPTX::INT_PTX_LDG_G_v4f32_ELE_asi, std::nullopt);
       break;
     case NVPTXISD::LDUV4:
       Opcode = pickOpcodeForVT(
-          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
-          NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
-          NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
-          NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
+          EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_asi,
+          NVPTX::INT_PTX_LDU_G_v4i16_ELE_asi,
+          NVPTX::INT_PTX_LDU_G_v4i32_ELE_asi, std::nullopt,
+          NVPTX::INT_PTX_LDU_G_v4f32_ELE_asi, std::nullopt);
       break;
     }
-    if (!Opcode)
-      return false;
-    SDValue Ops[] = { Addr, Chain };
-    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
   } else {
     if (TM.is64Bit()) {
       SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1369,11 +1362,11 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
         break;
       }
     }
-    if (!Opcode)
-      return false;
-    SDValue Ops[] = {Base, Offset, Chain};
-    LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
   }
+  if (!Opcode)
+    return false;
+  SDValue Ops[] = {Base, Offset, Chain};
+  LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
 
   // For automatic generation of LDG (through SelectLoad[Vector], not the
   // intrinsics), we may have an extending load like:
@@ -1577,7 +1570,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
                           std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
       break;
     }
-    Ops.append({Base, Offset});
   } else {
     if (PointerSize == 64) {
       SelectADDRri64(N2.getNode(), N2, Base, Offset);
@@ -1617,12 +1609,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
         break;
       }
     }
-    Ops.append({Base, Offset});
   }
   if (!Opcode)
     return false;
-
-  Ops.push_back(Chain);
+  Ops.append({Base, Offset, Chain});
 
   ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
 
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index acb9fc9867b0f..eca2397ff3f26 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2693,23 +2693,23 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
 // Scalar
 
 multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
- def avar:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
-               !strconcat("ldu.global.", TyStr),
+ def asi:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
+               "ldu.global." # TyStr # " \t$result, [$src$offset];",
                       []>, Requires<[hasLDU]>;
  def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
-               !strconcat("ldu.global.", TyStr),
+               "ldu.global." # TyStr # " \t$result, [$src];",
                       []>, Requires<[hasLDU]>;
  def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
-               !strconcat("ldu.global.", TyStr),
+               "ldu.global." # TyStr # " \t$result, [$src];",
                         []>, Requires<[hasLDU]>;
 }
 
-defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
-defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_i8  : LDU_G<"u8", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
 
 // vector
 
@@ -2717,56 +2717,40 @@ defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
 multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                      (ins MEMri:$src),
-                     !strconcat("ldu.global.", TyStr), []>;
+                     "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                      (ins MEMri64:$src),
-                     !strconcat("ldu.global.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
-                     (ins imemAny:$src),
-                     !strconcat("ldu.global.", TyStr), []>;
+                     "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins imemAny:$src, Offseti32imm:$offset),
+                     "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
 }
 
 multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
                             regclass:$dst4), (ins MEMri:$src),
-               !strconcat("ldu.global.", TyStr), []>;
+               "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
                             regclass:$dst4), (ins MEMri64:$src),
-               !strconcat("ldu.global.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
-                            regclass:$dst4), (ins imemAny:$src),
-               !strconcat("ldu.global.", TyStr), []>;
-}
-
-defm INT_PTX_LDU_G_v2i8_ELE
-  : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
-defm INT_PTX_LDU_G_v2i16_ELE
-  : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v2i32_ELE
-  : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDU_G_v2f32_ELE
-  : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
-defm INT_PTX_LDU_G_v2i64_ELE
-  : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
-defm INT_PTX_LDU_G_v2f64_ELE
-  : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
-defm INT_PTX_LDU_G_v4i8_ELE
-  : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v4i16_ELE
-  : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
-    Int16Regs>;
-defm INT_PTX_LDU_G_v4i32_ELE
-  : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
-    Int32Regs>;
-defm INT_PTX_LDU_G_v4f16_ELE
-  : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
-    Int16Regs>;
-defm INT_PTX_LDU_G_v4f16x2_ELE
-  : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
-    Int32Regs>;
-defm INT_PTX_LDU_G_v4f32_ELE
-  : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
-    Float32Regs>;
+               "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                            regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
+               "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
+
+defm INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE  : VLDU_G_ELE_V4<"u32", Int32Regs>;
+defm INT_PTX_LDU_G_v4f16_ELE   : VLDU_G_ELE_V4<"b16", Int16Regs>;
+defm INT_PTX_LDU_G_v4f16x2_ELE  : VLDU_G_ELE_V4<"b32", Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE  : VLDU_G_ELE_V4<"f32", Float32Regs>;
 
 
 //-----------------------------------
@@ -2778,29 +2762,23 @@ defm INT_PTX_LDU_G_v4f32_ELE
 // during the lifetime of the kernel.
 
 multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
- def avar:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
-               !strconcat("ld.global.nc.", TyStr),
+ def asi:  NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
+               "ld.global.nc." # TyStr # " \t$result, [$src$offset];",
                       []>, Requires<[hasLDG]>;
  def ari :  NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
-               !strconcat("ld.global.nc.", TyStr),
+               "ld.global.nc." # TyStr # " \t$result, [$src];",
                       []>, Requires<[hasLDG]>;
  def ari64 :  NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
-               !strconcat("ld.global.nc.", TyStr),
+               "ld.global.nc." # TyStr # " \t$result, [$src];",
                         []>, Requires<[hasLDG]>;
 }
 
-defm INT_PTX_LDG_GLOBAL_i8
-  : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDG_GLOBAL_i16
-  : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDG_GLOBAL_i32
-  : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDG_GLOBAL_i64
-  : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDG_GLOBAL_f32
-  : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
-defm INT_PTX_LDG_GLOBAL_f64
-  : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
+defm INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
 
 // vector
 
@@ -2808,54 +2786,39 @@ defm INT_PTX_LDG_GLOBAL_f64
 multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
  def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                      (ins MEMri:$src),
-                     !strconcat("ld.global.nc.", TyStr), []>;
+                     "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
  def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
                      (ins MEMri64:$src),
-                     !strconcat("ld.global.nc.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
-                     (ins imemAny:$src),
-                     !strconcat("ld.global.nc.", TyStr), []>;
+                     "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+                     (ins imemAny:$src, Offseti32imm:$offset),
+                     "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
 }
 
 multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
-  def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
-                              regclass:$dst4), (ins Int32Regs:$src),
-               !strconcat("ld.global.nc.", TyStr), []>;
-  def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
-                               regclass:$dst4), (ins Int64Regs:$src),
-               !strconcat("ld.global.nc.", TyStr), []>;
   def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
                               regclass:$dst4), (ins MEMri:$src),
-               !strconcat("ld.global.nc.", TyStr), []>;
+               "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
   def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
                               regclass:$dst4), (ins MEMri64:$src),
-               !strconcat("ld.global.nc.", TyStr), []>;
-  def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
-                             regclass:$dst4), (ins imemAny:$src),
-               !strconcat("ld.global.nc.", TyStr), []>;
+               "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+  def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+                             regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
+               "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
 }
 
 // FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-defm INT_PTX_LDG_G_v2i8_ELE
-  : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];",  Int16Regs>;
-defm INT_PTX_LDG_G_v2i16_ELE
-  : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v2i32_ELE
-  : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v2f32_ELE
-  : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
-defm INT_PTX_LDG_G_v2i64_ELE
-  : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
-defm INT_PTX_LDG_G_v2f64_ELE
-  : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
-defm INT_PTX_LDG_G_v4i8_ELE
-  : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v4i16_ELE
-  : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v4i32_ELE
-  : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v4f32_ELE
-  : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
+
+defm INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
 
 
 multiclass NG_TO_G<string Str> {
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 4d4db21c6ed0d..377528b94f505 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -214,34 +214,33 @@ define dso_local i32 @bar() {
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<8>;
 ; CHECK-PTX-NEXT:    .reg .b32 %r<4>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<6>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot3;
 ; CHECK-PTX-NEXT:    cvta.local.u64 %SP, %SPL;
 ; CHECK-PTX-NEXT:    add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT:    mov.u64 %rd3, __const_$_bar_$_s1;
-; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs1, [%rd3+7];
+; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs1, [__const_$_bar_$_s1+7];
 ; CHECK-PTX-NEXT:    cvt.u16.u8 %rs2, %rs1;
 ; CHECK-PTX-NEXT:    st.local.u8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT:    ld.global.nc...
[truncated]

@Artem-B left a comment

Nice. We could use more tests, but LGTM otherwise.

 ; CHECK-PTX-NEXT:    cvt.u16.u8 %rs2, %rs1;
 ; CHECK-PTX-NEXT:    st.local.u8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs3, [%rd3+6];
+; CHECK-PTX-NEXT:    ld.global.nc.u8 %rs3, [__const_$_bar_$_s1+6];

Given that the changes in this test file are largely incidental, we may want to add a few test cases to load/store tests that will exercise accessing data with ldu/ldg via symbol+offset.
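
A minimal sketch of such a test, assuming an invariant load from the global address space is what gets selected as ld.global.nc here; the global name, function name, and exact CHECK pattern below are illustrative, not taken from the patch:

; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_35 | FileCheck %s

@g1 = addrspace(1) global [4 x i32] zeroinitializer

; CHECK-LABEL: test_ldg_symbol_offset
; CHECK: ld.global.nc.u32 %r{{[0-9]+}}, [g1+4];
define i32 @test_ldg_symbol_offset() {
  ; Invariant load at a constant offset from a global-address-space symbol;
  ; with this change the +4 should fold into the address operand (asi form).
  %p = getelementptr inbounds [4 x i32], ptr addrspace(1) @g1, i64 0, i64 1
  %v = load i32, ptr addrspace(1) %p, !invariant.load !0
  ret i32 %v
}

!0 = !{}

An ldu counterpart through the llvm.nvvm.ldu.global.* intrinsics would exercise the other half of the patch.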


@g = addrspace(1) global i32 0

; CHECK-LABEL: test_ldg_asi

Let's automate the check generation while we're here, too.
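
For reference, a sketch of the autogenerated form, assuming the usual workflow of running llvm/utils/update_llc_test_checks.py over the test file (header and RUN line shown as the script typically emits/expects them; the exact sm version is illustrative):

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_35 | FileCheck %s

Re-running the script regenerates the CHECK lines whenever codegen changes, which keeps follow-up diffs like this one mechanical.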

@AlexMaclean merged commit d3dae84 into llvm:main Feb 25, 2025
9 of 11 checks passed