-
Notifications
You must be signed in to change notification settings - Fork 15.4k
[NVPTX] Switch to imm offset variants for LDG and LDU #128270
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[NVPTX] Switch to imm offset variants for LDG and LDU #128270
Conversation
|
@llvm/pr-subscribers-backend-nvptx Author: Alex MacLean (AlexMaclean). Changes: Patch is 23.12 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/128270.diff 3 Files Affected:
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
index 6f0bf510ad893..971a128aadfdb 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp
@@ -1106,9 +1106,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
break;
}
- if (!Opcode)
- return false;
- Ops.append({Base, Offset, Chain});
} else {
if (PointerSize == 64) {
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1148,10 +1145,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
break;
}
}
- if (!Opcode)
- return false;
- Ops.append({Base, Offset, Chain});
}
+ if (!Opcode)
+ return false;
+ Ops.append({Base, Offset, Chain});
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
@@ -1202,63 +1199,59 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
std::optional<unsigned> Opcode;
SDLoc DL(N);
SDNode *LD;
- SDValue Base, Offset, Addr;
+ SDValue Base, Offset;
- if (SelectDirectAddr(Op1, Addr)) {
+ if (SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
switch (N->getOpcode()) {
default:
return false;
case ISD::LOAD:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
- NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
- NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
- NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_i16asi, NVPTX::INT_PTX_LDG_GLOBAL_i32asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_i64asi, NVPTX::INT_PTX_LDG_GLOBAL_f32asi,
+ NVPTX::INT_PTX_LDG_GLOBAL_f64asi);
break;
case ISD::INTRINSIC_W_CHAIN:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
- NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
- NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
- NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_i16asi, NVPTX::INT_PTX_LDU_GLOBAL_i32asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_i64asi, NVPTX::INT_PTX_LDU_GLOBAL_f32asi,
+ NVPTX::INT_PTX_LDU_GLOBAL_f64asi);
break;
case NVPTXISD::LoadV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
- NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
+ NVPTX::INT_PTX_LDG_G_v2i8_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i16_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i32_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2i64_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2f32_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v2f64_ELE_asi);
break;
case NVPTXISD::LDUV2:
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
- NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
+ NVPTX::INT_PTX_LDU_G_v2i8_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i16_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i32_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2i64_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2f32_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v2f64_ELE_asi);
break;
case NVPTXISD::LoadV4:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
- NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
- NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v4i16_ELE_asi,
+ NVPTX::INT_PTX_LDG_G_v4i32_ELE_asi, std::nullopt,
+ NVPTX::INT_PTX_LDG_G_v4f32_ELE_asi, std::nullopt);
break;
case NVPTXISD::LDUV4:
Opcode = pickOpcodeForVT(
- EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
- NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
- NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
+ EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v4i16_ELE_asi,
+ NVPTX::INT_PTX_LDU_G_v4i32_ELE_asi, std::nullopt,
+ NVPTX::INT_PTX_LDU_G_v4f32_ELE_asi, std::nullopt);
break;
}
- if (!Opcode)
- return false;
- SDValue Ops[] = { Addr, Chain };
- LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
} else {
if (TM.is64Bit()) {
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1369,11 +1362,11 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
break;
}
}
- if (!Opcode)
- return false;
- SDValue Ops[] = {Base, Offset, Chain};
- LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
}
+ if (!Opcode)
+ return false;
+ SDValue Ops[] = {Base, Offset, Chain};
+ LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
// For automatic generation of LDG (through SelectLoad[Vector], not the
// intrinsics), we may have an extending load like:
@@ -1577,7 +1570,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
break;
}
- Ops.append({Base, Offset});
} else {
if (PointerSize == 64) {
SelectADDRri64(N2.getNode(), N2, Base, Offset);
@@ -1617,12 +1609,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
break;
}
}
- Ops.append({Base, Offset});
}
if (!Opcode)
return false;
-
- Ops.push_back(Chain);
+ Ops.append({Base, Offset, Chain});
ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
index acb9fc9867b0f..eca2397ff3f26 100644
--- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
+++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td
@@ -2693,23 +2693,23 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
// Scalar
multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
- def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr),
+ def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global." # TyStr # " \t$result, [$src$offset];",
[]>, Requires<[hasLDU]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ldu.global.", TyStr),
+ "ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr),
+ "ldu.global." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDU]>;
}
-defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
-defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
+defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
+defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
+defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
+defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
// vector
@@ -2717,56 +2717,40 @@ defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
- (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
}
multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
- !strconcat("ldu.global.", TyStr), []>;
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri64:$src),
- !strconcat("ldu.global.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins imemAny:$src),
- !strconcat("ldu.global.", TyStr), []>;
-}
-
-defm INT_PTX_LDU_G_v2i8_ELE
- : VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v2i16_ELE
- : VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v2i32_ELE
- : VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDU_G_v2f32_ELE
- : VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
-defm INT_PTX_LDU_G_v2i64_ELE
- : VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
-defm INT_PTX_LDU_G_v2f64_ELE
- : VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
-defm INT_PTX_LDU_G_v4i8_ELE
- : VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDU_G_v4i16_ELE
- : VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int16Regs>;
-defm INT_PTX_LDU_G_v4i32_ELE
- : VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int32Regs>;
-defm INT_PTX_LDU_G_v4f16_ELE
- : VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int16Regs>;
-defm INT_PTX_LDU_G_v4f16x2_ELE
- : VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Int32Regs>;
-defm INT_PTX_LDU_G_v4f32_ELE
- : VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
- Float32Regs>;
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
+ "ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
+}
+
+defm INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
+defm INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
+defm INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
+defm INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
+defm INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
+defm INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
+
+defm INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
+defm INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
+defm INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>;
+defm INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
+defm INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
+defm INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>;
//-----------------------------------
@@ -2778,29 +2762,23 @@ defm INT_PTX_LDU_G_v4f32_ELE
// during the lifetime of the kernel.
multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
- def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr),
+ def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc." # TyStr # " \t$result, [$src$offset];",
[]>, Requires<[hasLDG]>;
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr),
+ "ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr),
+ "ld.global.nc." # TyStr # " \t$result, [$src];",
[]>, Requires<[hasLDG]>;
}
-defm INT_PTX_LDG_GLOBAL_i8
- : LDG_G<"u8 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDG_GLOBAL_i16
- : LDG_G<"u16 \t$result, [$src];", Int16Regs>;
-defm INT_PTX_LDG_GLOBAL_i32
- : LDG_G<"u32 \t$result, [$src];", Int32Regs>;
-defm INT_PTX_LDG_GLOBAL_i64
- : LDG_G<"u64 \t$result, [$src];", Int64Regs>;
-defm INT_PTX_LDG_GLOBAL_f32
- : LDG_G<"f32 \t$result, [$src];", Float32Regs>;
-defm INT_PTX_LDG_GLOBAL_f64
- : LDG_G<"f64 \t$result, [$src];", Float64Regs>;
+defm INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
+defm INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
+defm INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
+defm INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
+defm INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
// vector
@@ -2808,54 +2786,39 @@ defm INT_PTX_LDG_GLOBAL_f64
multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
(ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
- (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
+ (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
}
multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
- def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int32Regs:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins Int64Regs:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
regclass:$dst4), (ins MEMri64:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
- def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
- regclass:$dst4), (ins imemAny:$src),
- !strconcat("ld.global.nc.", TyStr), []>;
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
+ def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
+ regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
+ "ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
}
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
-defm INT_PTX_LDG_G_v2i8_ELE
- : VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v2i16_ELE
- : VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v2i32_ELE
- : VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v2f32_ELE
- : VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
-defm INT_PTX_LDG_G_v2i64_ELE
- : VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
-defm INT_PTX_LDG_G_v2f64_ELE
- : VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
-defm INT_PTX_LDG_G_v4i8_ELE
- : VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v4i16_ELE
- : VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
-defm INT_PTX_LDG_G_v4i32_ELE
- : VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
-defm INT_PTX_LDG_G_v4f32_ELE
- : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
+defm INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
+defm INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
+defm INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
+defm INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
+defm INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
+defm INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
+
+defm INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
+defm INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
+defm INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
+defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
multiclass NG_TO_G<string Str> {
diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 4d4db21c6ed0d..377528b94f505 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -214,34 +214,33 @@ define dso_local i32 @bar() {
; CHECK-PTX-NEXT: .reg .b64 %SPL;
; CHECK-PTX-NEXT: .reg .b16 %rs<8>;
; CHECK-PTX-NEXT: .reg .b32 %r<4>;
-; CHECK-PTX-NEXT: .reg .b64 %rd<6>;
+; CHECK-PTX-NEXT: .reg .b64 %rd<5>;
; CHECK-PTX-EMPTY:
; CHECK-PTX-NEXT: // %bb.0: // %entry
; CHECK-PTX-NEXT: mov.u64 %SPL, __local_depot3;
; CHECK-PTX-NEXT: cvta.local.u64 %SP, %SPL;
; CHECK-PTX-NEXT: add.u64 %rd2, %SPL, 0;
-; CHECK-PTX-NEXT: mov.u64 %rd3, __const_$_bar_$_s1;
-; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [%rd3+7];
+; CHECK-PTX-NEXT: ld.global.nc.u8 %rs1, [__const_$_bar_$_s1+7];
; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
; CHECK-PTX-NEXT: st.local.u8 [%rd2+2], %rs2;
-; CHECK-PTX-NEXT: ld.global.nc...
[truncated]
|
Artem-B
left a comment
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Nice. We could use more tests, but LGTM otherwise.
  ; CHECK-PTX-NEXT: cvt.u16.u8 %rs2, %rs1;
  ; CHECK-PTX-NEXT: st.local.u8 [%rd2+2], %rs2;
- ; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [%rd3+6];
+ ; CHECK-PTX-NEXT: ld.global.nc.u8 %rs3, [__const_$_bar_$_s1+6];
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Given that the changes in this test file are largely incidental, we may want to add a few test cases to load/store tests that will exercise accessing data with ldu/ldg via symbol+offset.
llvm/test/CodeGen/NVPTX/ldu-ldg.ll
Outdated
+ @g = addrspace(1) global i32 0

+ ; CHECK-LABEL: test_ldg_asi
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Let's automate the check generation while we're here, too.
No description provided.