Skip to content

Commit 391e451

Browse files
committed
[NVPTX] Switch to imm offset variants for LDG and LDU
1 parent 5f8da7e commit 391e451

File tree

3 files changed

+125
-174
lines changed

3 files changed

+125
-174
lines changed

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 38 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -1106,9 +1106,6 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
11061106
std::nullopt, NVPTX::LDV_f32_v4_asi, std::nullopt);
11071107
break;
11081108
}
1109-
if (!Opcode)
1110-
return false;
1111-
Ops.append({Base, Offset, Chain});
11121109
} else {
11131110
if (PointerSize == 64) {
11141111
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1148,10 +1145,10 @@ bool NVPTXDAGToDAGISel::tryLoadVector(SDNode *N) {
11481145
break;
11491146
}
11501147
}
1151-
if (!Opcode)
1152-
return false;
1153-
Ops.append({Base, Offset, Chain});
11541148
}
1149+
if (!Opcode)
1150+
return false;
1151+
Ops.append({Base, Offset, Chain});
11551152
LD = CurDAG->getMachineNode(*Opcode, DL, N->getVTList(), Ops);
11561153

11571154
MachineMemOperand *MemRef = cast<MemSDNode>(N)->getMemOperand();
@@ -1202,63 +1199,59 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
12021199
std::optional<unsigned> Opcode;
12031200
SDLoc DL(N);
12041201
SDNode *LD;
1205-
SDValue Base, Offset, Addr;
1202+
SDValue Base, Offset;
12061203

1207-
if (SelectDirectAddr(Op1, Addr)) {
1204+
if (SelectADDRsi(Op1.getNode(), Op1, Base, Offset)) {
12081205
switch (N->getOpcode()) {
12091206
default:
12101207
return false;
12111208
case ISD::LOAD:
12121209
Opcode = pickOpcodeForVT(
1213-
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8avar,
1214-
NVPTX::INT_PTX_LDG_GLOBAL_i16avar, NVPTX::INT_PTX_LDG_GLOBAL_i32avar,
1215-
NVPTX::INT_PTX_LDG_GLOBAL_i64avar, NVPTX::INT_PTX_LDG_GLOBAL_f32avar,
1216-
NVPTX::INT_PTX_LDG_GLOBAL_f64avar);
1210+
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_GLOBAL_i8asi,
1211+
NVPTX::INT_PTX_LDG_GLOBAL_i16asi, NVPTX::INT_PTX_LDG_GLOBAL_i32asi,
1212+
NVPTX::INT_PTX_LDG_GLOBAL_i64asi, NVPTX::INT_PTX_LDG_GLOBAL_f32asi,
1213+
NVPTX::INT_PTX_LDG_GLOBAL_f64asi);
12171214
break;
12181215
case ISD::INTRINSIC_W_CHAIN:
12191216
Opcode = pickOpcodeForVT(
1220-
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8avar,
1221-
NVPTX::INT_PTX_LDU_GLOBAL_i16avar, NVPTX::INT_PTX_LDU_GLOBAL_i32avar,
1222-
NVPTX::INT_PTX_LDU_GLOBAL_i64avar, NVPTX::INT_PTX_LDU_GLOBAL_f32avar,
1223-
NVPTX::INT_PTX_LDU_GLOBAL_f64avar);
1217+
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_GLOBAL_i8asi,
1218+
NVPTX::INT_PTX_LDU_GLOBAL_i16asi, NVPTX::INT_PTX_LDU_GLOBAL_i32asi,
1219+
NVPTX::INT_PTX_LDU_GLOBAL_i64asi, NVPTX::INT_PTX_LDU_GLOBAL_f32asi,
1220+
NVPTX::INT_PTX_LDU_GLOBAL_f64asi);
12241221
break;
12251222
case NVPTXISD::LoadV2:
12261223
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1227-
NVPTX::INT_PTX_LDG_G_v2i8_ELE_avar,
1228-
NVPTX::INT_PTX_LDG_G_v2i16_ELE_avar,
1229-
NVPTX::INT_PTX_LDG_G_v2i32_ELE_avar,
1230-
NVPTX::INT_PTX_LDG_G_v2i64_ELE_avar,
1231-
NVPTX::INT_PTX_LDG_G_v2f32_ELE_avar,
1232-
NVPTX::INT_PTX_LDG_G_v2f64_ELE_avar);
1224+
NVPTX::INT_PTX_LDG_G_v2i8_ELE_asi,
1225+
NVPTX::INT_PTX_LDG_G_v2i16_ELE_asi,
1226+
NVPTX::INT_PTX_LDG_G_v2i32_ELE_asi,
1227+
NVPTX::INT_PTX_LDG_G_v2i64_ELE_asi,
1228+
NVPTX::INT_PTX_LDG_G_v2f32_ELE_asi,
1229+
NVPTX::INT_PTX_LDG_G_v2f64_ELE_asi);
12331230
break;
12341231
case NVPTXISD::LDUV2:
12351232
Opcode = pickOpcodeForVT(EltVT.getSimpleVT().SimpleTy,
1236-
NVPTX::INT_PTX_LDU_G_v2i8_ELE_avar,
1237-
NVPTX::INT_PTX_LDU_G_v2i16_ELE_avar,
1238-
NVPTX::INT_PTX_LDU_G_v2i32_ELE_avar,
1239-
NVPTX::INT_PTX_LDU_G_v2i64_ELE_avar,
1240-
NVPTX::INT_PTX_LDU_G_v2f32_ELE_avar,
1241-
NVPTX::INT_PTX_LDU_G_v2f64_ELE_avar);
1233+
NVPTX::INT_PTX_LDU_G_v2i8_ELE_asi,
1234+
NVPTX::INT_PTX_LDU_G_v2i16_ELE_asi,
1235+
NVPTX::INT_PTX_LDU_G_v2i32_ELE_asi,
1236+
NVPTX::INT_PTX_LDU_G_v2i64_ELE_asi,
1237+
NVPTX::INT_PTX_LDU_G_v2f32_ELE_asi,
1238+
NVPTX::INT_PTX_LDU_G_v2f64_ELE_asi);
12421239
break;
12431240
case NVPTXISD::LoadV4:
12441241
Opcode = pickOpcodeForVT(
1245-
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_avar,
1246-
NVPTX::INT_PTX_LDG_G_v4i16_ELE_avar,
1247-
NVPTX::INT_PTX_LDG_G_v4i32_ELE_avar, std::nullopt,
1248-
NVPTX::INT_PTX_LDG_G_v4f32_ELE_avar, std::nullopt);
1242+
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDG_G_v4i8_ELE_asi,
1243+
NVPTX::INT_PTX_LDG_G_v4i16_ELE_asi,
1244+
NVPTX::INT_PTX_LDG_G_v4i32_ELE_asi, std::nullopt,
1245+
NVPTX::INT_PTX_LDG_G_v4f32_ELE_asi, std::nullopt);
12491246
break;
12501247
case NVPTXISD::LDUV4:
12511248
Opcode = pickOpcodeForVT(
1252-
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_avar,
1253-
NVPTX::INT_PTX_LDU_G_v4i16_ELE_avar,
1254-
NVPTX::INT_PTX_LDU_G_v4i32_ELE_avar, std::nullopt,
1255-
NVPTX::INT_PTX_LDU_G_v4f32_ELE_avar, std::nullopt);
1249+
EltVT.getSimpleVT().SimpleTy, NVPTX::INT_PTX_LDU_G_v4i8_ELE_asi,
1250+
NVPTX::INT_PTX_LDU_G_v4i16_ELE_asi,
1251+
NVPTX::INT_PTX_LDU_G_v4i32_ELE_asi, std::nullopt,
1252+
NVPTX::INT_PTX_LDU_G_v4f32_ELE_asi, std::nullopt);
12561253
break;
12571254
}
1258-
if (!Opcode)
1259-
return false;
1260-
SDValue Ops[] = { Addr, Chain };
1261-
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
12621255
} else {
12631256
if (TM.is64Bit()) {
12641257
SelectADDRri64(Op1.getNode(), Op1, Base, Offset);
@@ -1369,11 +1362,11 @@ bool NVPTXDAGToDAGISel::tryLDGLDU(SDNode *N) {
13691362
break;
13701363
}
13711364
}
1372-
if (!Opcode)
1373-
return false;
1374-
SDValue Ops[] = {Base, Offset, Chain};
1375-
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
13761365
}
1366+
if (!Opcode)
1367+
return false;
1368+
SDValue Ops[] = {Base, Offset, Chain};
1369+
LD = CurDAG->getMachineNode(*Opcode, DL, InstVTList, Ops);
13771370

13781371
// For automatic generation of LDG (through SelectLoad[Vector], not the
13791372
// intrinsics), we may have an extending load like:
@@ -1577,7 +1570,6 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
15771570
std::nullopt, NVPTX::STV_f32_v4_asi, std::nullopt);
15781571
break;
15791572
}
1580-
Ops.append({Base, Offset});
15811573
} else {
15821574
if (PointerSize == 64) {
15831575
SelectADDRri64(N2.getNode(), N2, Base, Offset);
@@ -1617,12 +1609,10 @@ bool NVPTXDAGToDAGISel::tryStoreVector(SDNode *N) {
16171609
break;
16181610
}
16191611
}
1620-
Ops.append({Base, Offset});
16211612
}
16221613
if (!Opcode)
16231614
return false;
1624-
1625-
Ops.push_back(Chain);
1615+
Ops.append({Base, Offset, Chain});
16261616

16271617
ST = CurDAG->getMachineNode(*Opcode, DL, MVT::Other, Ops);
16281618

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 66 additions & 103 deletions
Original file line number | Diff line number | Diff line change
@@ -2693,80 +2693,64 @@ defm INT_PTX_SATOM_XOR : ATOM2_bitwise_impl<"xor">;
26932693
// Scalar
26942694

26952695
multiclass LDU_G<string TyStr, NVPTXRegClass regclass> {
2696-
def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
2697-
!strconcat("ldu.global.", TyStr),
2696+
def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
2697+
"ldu.global." # TyStr # " \t$result, [$src$offset];",
26982698
[]>, Requires<[hasLDU]>;
26992699
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
2700-
!strconcat("ldu.global.", TyStr),
2700+
"ldu.global." # TyStr # " \t$result, [$src];",
27012701
[]>, Requires<[hasLDU]>;
27022702
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
2703-
!strconcat("ldu.global.", TyStr),
2703+
"ldu.global." # TyStr # " \t$result, [$src];",
27042704
[]>, Requires<[hasLDU]>;
27052705
}
27062706

2707-
defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8 \t$result, [$src];", Int16Regs>;
2708-
defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16 \t$result, [$src];", Int16Regs>;
2709-
defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32 \t$result, [$src];", Int32Regs>;
2710-
defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64 \t$result, [$src];", Int64Regs>;
2711-
defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32 \t$result, [$src];", Float32Regs>;
2712-
defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64 \t$result, [$src];", Float64Regs>;
2707+
defm INT_PTX_LDU_GLOBAL_i8 : LDU_G<"u8", Int16Regs>;
2708+
defm INT_PTX_LDU_GLOBAL_i16 : LDU_G<"u16", Int16Regs>;
2709+
defm INT_PTX_LDU_GLOBAL_i32 : LDU_G<"u32", Int32Regs>;
2710+
defm INT_PTX_LDU_GLOBAL_i64 : LDU_G<"u64", Int64Regs>;
2711+
defm INT_PTX_LDU_GLOBAL_f32 : LDU_G<"f32", Float32Regs>;
2712+
defm INT_PTX_LDU_GLOBAL_f64 : LDU_G<"f64", Float64Regs>;
27132713

27142714
// vector
27152715

27162716
// Elementized vector ldu
27172717
multiclass VLDU_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
27182718
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
27192719
(ins MEMri:$src),
2720-
!strconcat("ldu.global.", TyStr), []>;
2720+
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
27212721
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
27222722
(ins MEMri64:$src),
2723-
!strconcat("ldu.global.", TyStr), []>;
2724-
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2725-
(ins imemAny:$src),
2726-
!strconcat("ldu.global.", TyStr), []>;
2723+
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
2724+
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2725+
(ins imemAny:$src, Offseti32imm:$offset),
2726+
"ldu.global.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
27272727
}
27282728

27292729
multiclass VLDU_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
27302730
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
27312731
regclass:$dst4), (ins MEMri:$src),
2732-
!strconcat("ldu.global.", TyStr), []>;
2732+
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
27332733
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
27342734
regclass:$dst4), (ins MEMri64:$src),
2735-
!strconcat("ldu.global.", TyStr), []>;
2736-
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2737-
regclass:$dst4), (ins imemAny:$src),
2738-
!strconcat("ldu.global.", TyStr), []>;
2739-
}
2740-
2741-
defm INT_PTX_LDU_G_v2i8_ELE
2742-
: VLDU_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2743-
defm INT_PTX_LDU_G_v2i16_ELE
2744-
: VLDU_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2745-
defm INT_PTX_LDU_G_v2i32_ELE
2746-
: VLDU_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
2747-
defm INT_PTX_LDU_G_v2f32_ELE
2748-
: VLDU_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
2749-
defm INT_PTX_LDU_G_v2i64_ELE
2750-
: VLDU_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
2751-
defm INT_PTX_LDU_G_v2f64_ELE
2752-
: VLDU_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
2753-
defm INT_PTX_LDU_G_v4i8_ELE
2754-
: VLDU_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2755-
defm INT_PTX_LDU_G_v4i16_ELE
2756-
: VLDU_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2757-
Int16Regs>;
2758-
defm INT_PTX_LDU_G_v4i32_ELE
2759-
: VLDU_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2760-
Int32Regs>;
2761-
defm INT_PTX_LDU_G_v4f16_ELE
2762-
: VLDU_G_ELE_V4<"v4.b16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2763-
Int16Regs>;
2764-
defm INT_PTX_LDU_G_v4f16x2_ELE
2765-
: VLDU_G_ELE_V4<"v4.b32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2766-
Int32Regs>;
2767-
defm INT_PTX_LDU_G_v4f32_ELE
2768-
: VLDU_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];",
2769-
Float32Regs>;
2735+
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
2736+
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2737+
regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
2738+
"ldu.global.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
2739+
}
2740+
2741+
defm INT_PTX_LDU_G_v2i8_ELE : VLDU_G_ELE_V2<"u8", Int16Regs>;
2742+
defm INT_PTX_LDU_G_v2i16_ELE : VLDU_G_ELE_V2<"u16", Int16Regs>;
2743+
defm INT_PTX_LDU_G_v2i32_ELE : VLDU_G_ELE_V2<"u32", Int32Regs>;
2744+
defm INT_PTX_LDU_G_v2f32_ELE : VLDU_G_ELE_V2<"f32", Float32Regs>;
2745+
defm INT_PTX_LDU_G_v2i64_ELE : VLDU_G_ELE_V2<"u64", Int64Regs>;
2746+
defm INT_PTX_LDU_G_v2f64_ELE : VLDU_G_ELE_V2<"f64", Float64Regs>;
2747+
2748+
defm INT_PTX_LDU_G_v4i8_ELE : VLDU_G_ELE_V4<"u8", Int16Regs>;
2749+
defm INT_PTX_LDU_G_v4i16_ELE : VLDU_G_ELE_V4<"u16", Int16Regs>;
2750+
defm INT_PTX_LDU_G_v4i32_ELE : VLDU_G_ELE_V4<"u32", Int32Regs>;
2751+
defm INT_PTX_LDU_G_v4f16_ELE : VLDU_G_ELE_V4<"b16", Int16Regs>;
2752+
defm INT_PTX_LDU_G_v4f16x2_ELE : VLDU_G_ELE_V4<"b32", Int32Regs>;
2753+
defm INT_PTX_LDU_G_v4f32_ELE : VLDU_G_ELE_V4<"f32", Float32Regs>;
27702754

27712755

27722756
//-----------------------------------
@@ -2778,84 +2762,63 @@ defm INT_PTX_LDU_G_v4f32_ELE
27782762
// during the lifetime of the kernel.
27792763

27802764
multiclass LDG_G<string TyStr, NVPTXRegClass regclass> {
2781-
def avar: NVPTXInst<(outs regclass:$result), (ins imemAny:$src),
2782-
!strconcat("ld.global.nc.", TyStr),
2765+
def asi: NVPTXInst<(outs regclass:$result), (ins imemAny:$src, Offseti32imm:$offset),
2766+
"ld.global.nc." # TyStr # " \t$result, [$src$offset];",
27832767
[]>, Requires<[hasLDG]>;
27842768
def ari : NVPTXInst<(outs regclass:$result), (ins MEMri:$src),
2785-
!strconcat("ld.global.nc.", TyStr),
2769+
"ld.global.nc." # TyStr # " \t$result, [$src];",
27862770
[]>, Requires<[hasLDG]>;
27872771
def ari64 : NVPTXInst<(outs regclass:$result), (ins MEMri64:$src),
2788-
!strconcat("ld.global.nc.", TyStr),
2772+
"ld.global.nc." # TyStr # " \t$result, [$src];",
27892773
[]>, Requires<[hasLDG]>;
27902774
}
27912775

2792-
defm INT_PTX_LDG_GLOBAL_i8
2793-
: LDG_G<"u8 \t$result, [$src];", Int16Regs>;
2794-
defm INT_PTX_LDG_GLOBAL_i16
2795-
: LDG_G<"u16 \t$result, [$src];", Int16Regs>;
2796-
defm INT_PTX_LDG_GLOBAL_i32
2797-
: LDG_G<"u32 \t$result, [$src];", Int32Regs>;
2798-
defm INT_PTX_LDG_GLOBAL_i64
2799-
: LDG_G<"u64 \t$result, [$src];", Int64Regs>;
2800-
defm INT_PTX_LDG_GLOBAL_f32
2801-
: LDG_G<"f32 \t$result, [$src];", Float32Regs>;
2802-
defm INT_PTX_LDG_GLOBAL_f64
2803-
: LDG_G<"f64 \t$result, [$src];", Float64Regs>;
2776+
defm INT_PTX_LDG_GLOBAL_i8 : LDG_G<"u8", Int16Regs>;
2777+
defm INT_PTX_LDG_GLOBAL_i16 : LDG_G<"u16", Int16Regs>;
2778+
defm INT_PTX_LDG_GLOBAL_i32 : LDG_G<"u32", Int32Regs>;
2779+
defm INT_PTX_LDG_GLOBAL_i64 : LDG_G<"u64", Int64Regs>;
2780+
defm INT_PTX_LDG_GLOBAL_f32 : LDG_G<"f32", Float32Regs>;
2781+
defm INT_PTX_LDG_GLOBAL_f64 : LDG_G<"f64", Float64Regs>;
28042782

28052783
// vector
28062784

28072785
// Elementized vector ldg
28082786
multiclass VLDG_G_ELE_V2<string TyStr, NVPTXRegClass regclass> {
28092787
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
28102788
(ins MEMri:$src),
2811-
!strconcat("ld.global.nc.", TyStr), []>;
2789+
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
28122790
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
28132791
(ins MEMri64:$src),
2814-
!strconcat("ld.global.nc.", TyStr), []>;
2815-
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2816-
(ins imemAny:$src),
2817-
!strconcat("ld.global.nc.", TyStr), []>;
2792+
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src];", []>;
2793+
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2),
2794+
(ins imemAny:$src, Offseti32imm:$offset),
2795+
"ld.global.nc.v2." # TyStr # " \t{{$dst1, $dst2}}, [$src$offset];", []>;
28182796
}
28192797

28202798
multiclass VLDG_G_ELE_V4<string TyStr, NVPTXRegClass regclass> {
2821-
def _areg32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2822-
regclass:$dst4), (ins Int32Regs:$src),
2823-
!strconcat("ld.global.nc.", TyStr), []>;
2824-
def _areg64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2825-
regclass:$dst4), (ins Int64Regs:$src),
2826-
!strconcat("ld.global.nc.", TyStr), []>;
28272799
def _ari32: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
28282800
regclass:$dst4), (ins MEMri:$src),
2829-
!strconcat("ld.global.nc.", TyStr), []>;
2801+
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
28302802
def _ari64: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
28312803
regclass:$dst4), (ins MEMri64:$src),
2832-
!strconcat("ld.global.nc.", TyStr), []>;
2833-
def _avar: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2834-
regclass:$dst4), (ins imemAny:$src),
2835-
!strconcat("ld.global.nc.", TyStr), []>;
2804+
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", []>;
2805+
def _asi: NVPTXInst<(outs regclass:$dst1, regclass:$dst2, regclass:$dst3,
2806+
regclass:$dst4), (ins imemAny:$src, Offseti32imm:$offset),
2807+
"ld.global.nc.v4." # TyStr # " \t{{$dst1, $dst2, $dst3, $dst4}}, [$src$offset];", []>;
28362808
}
28372809

28382810
// FIXME: 8-bit LDG should be fixed once LDG/LDU nodes are made into proper loads.
2839-
defm INT_PTX_LDG_G_v2i8_ELE
2840-
: VLDG_G_ELE_V2<"v2.u8 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2841-
defm INT_PTX_LDG_G_v2i16_ELE
2842-
: VLDG_G_ELE_V2<"v2.u16 \t{{$dst1, $dst2}}, [$src];", Int16Regs>;
2843-
defm INT_PTX_LDG_G_v2i32_ELE
2844-
: VLDG_G_ELE_V2<"v2.u32 \t{{$dst1, $dst2}}, [$src];", Int32Regs>;
2845-
defm INT_PTX_LDG_G_v2f32_ELE
2846-
: VLDG_G_ELE_V2<"v2.f32 \t{{$dst1, $dst2}}, [$src];", Float32Regs>;
2847-
defm INT_PTX_LDG_G_v2i64_ELE
2848-
: VLDG_G_ELE_V2<"v2.u64 \t{{$dst1, $dst2}}, [$src];", Int64Regs>;
2849-
defm INT_PTX_LDG_G_v2f64_ELE
2850-
: VLDG_G_ELE_V2<"v2.f64 \t{{$dst1, $dst2}}, [$src];", Float64Regs>;
2851-
defm INT_PTX_LDG_G_v4i8_ELE
2852-
: VLDG_G_ELE_V4<"v4.u8 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2853-
defm INT_PTX_LDG_G_v4i16_ELE
2854-
: VLDG_G_ELE_V4<"v4.u16 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int16Regs>;
2855-
defm INT_PTX_LDG_G_v4i32_ELE
2856-
: VLDG_G_ELE_V4<"v4.u32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Int32Regs>;
2857-
defm INT_PTX_LDG_G_v4f32_ELE
2858-
: VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>;
2811+
defm INT_PTX_LDG_G_v2i8_ELE : VLDG_G_ELE_V2<"u8", Int16Regs>;
2812+
defm INT_PTX_LDG_G_v2i16_ELE : VLDG_G_ELE_V2<"u16", Int16Regs>;
2813+
defm INT_PTX_LDG_G_v2i32_ELE : VLDG_G_ELE_V2<"u32", Int32Regs>;
2814+
defm INT_PTX_LDG_G_v2f32_ELE : VLDG_G_ELE_V2<"f32", Float32Regs>;
2815+
defm INT_PTX_LDG_G_v2i64_ELE : VLDG_G_ELE_V2<"u64", Int64Regs>;
2816+
defm INT_PTX_LDG_G_v2f64_ELE : VLDG_G_ELE_V2<"f64", Float64Regs>;
2817+
2818+
defm INT_PTX_LDG_G_v4i8_ELE : VLDG_G_ELE_V4<"u8", Int16Regs>;
2819+
defm INT_PTX_LDG_G_v4i16_ELE : VLDG_G_ELE_V4<"u16", Int16Regs>;
2820+
defm INT_PTX_LDG_G_v4i32_ELE : VLDG_G_ELE_V4<"u32", Int32Regs>;
2821+
defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"f32", Float32Regs>;
28592822

28602823

28612824
multiclass NG_TO_G<string Str> {

0 commit comments

Comments
 (0)