Commit 7b29b49
[NVPTX] Move TMA G2S lowering to Tablegen (#165710)
This change refactors the G2S TMA implementation to use a pure TableGen-based expansion instead of the verbose ISel DAG expansion. In addition, it adds proper architecture predicates for TMA G2S. All test cases were validated locally with the CUDA 13.0 toolkit.
1 parent 0ed8e66 commit 7b29b49

11 files changed: +96, −249 lines changed
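For context, these are the intrinsics whose selection moves from C++ ISel into TableGen patterns. Below is a minimal IR sketch of the 1-D tile variant, not taken from the commit's tests: the operand order follows the comment in the removed ISel code (dst, mbar, tensor map, dims, multicast mask, cache hint, the two hint flags, and the cta_group flag), while the pointer address spaces and the expected PTX are assumptions to check against NVPTXUsage.rst.

; Hypothetical example, assuming addrspace(7) = shared::cluster and
; addrspace(3) = shared, per NVPTX conventions.
declare void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(
    ptr addrspace(7), ptr addrspace(3), ptr, i32, i16, i64, i1, i1, i32)

define void @g2s_tile_1d(ptr addrspace(7) %dst, ptr addrspace(3) %mbar,
                         ptr %tmap, i32 %d0, i64 %ch) {
  ; Cache hint enabled (i1 true), no multicast (i1 false), cta_group flag 0.
  ; Per the old G2S_STRINGS construction, this should select to something like:
  ;   cp.async.bulk.tensor.1d.shared::cluster.global.tile
  ;     .mbarrier::complete_tx::bytes.L2::cache_hint
  ;     [%dst], [%tmap, {%d0}], [%mbar], %ch;
  call void @llvm.nvvm.cp.async.bulk.tensor.g2s.tile.1d(
      ptr addrspace(7) %dst, ptr addrspace(3) %mbar, ptr %tmap,
      i32 %d0, i16 0, i64 %ch, i1 false, i1 true, i32 0)
  ret void
}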

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp

Lines changed: 0 additions & 129 deletions
@@ -1871,17 +1871,6 @@ bool NVPTXScopes::empty() const { return Scopes.size() == 0; }
       (is_ch ? (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, _CH)) \
              : (CP_ASYNC_BULK_TENSOR_OPCODE(RED, dim, mode, is_s32, )))
 
-#define GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(dim, mode, is_mc, is_ch, is_s32) \
-  [&]() -> auto { \
-    if (is_mc && is_ch) \
-      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC_CH); \
-    if (is_ch) \
-      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _CH); \
-    if (is_mc) \
-      return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, _MC); \
-    return CP_ASYNC_BULK_TENSOR_OPCODE(G2S, dim, mode, is_s32, ); \
-  }()
-
 static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
                                                        bool IsShared32,
                                                        bool IsCacheHint,
@@ -1925,112 +1914,6 @@ static unsigned GetCpAsyncBulkTensorS2GReductionOpcode(size_t Dim,
   }
 }
 
-static unsigned GetCpAsyncBulkTensorG2SOpcode(size_t Dim, bool IsShared32,
-                                              bool IsMultiCast,
-                                              bool IsCacheHint, bool IsIm2Col) {
-  if (IsIm2Col) {
-    switch (Dim) {
-    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, IM2COL, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, IM2COL, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, IM2COL, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    default:
-      llvm_unreachable("Invalid Dimension in im2col mode for "
-                       "GetCpAsyncBulkTensorG2SOpcode.");
-    }
-  } else {
-    switch (Dim) {
-    case 1:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(1D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 2:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(2D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 3:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(3D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 4:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(4D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    case 5:
-      return GET_CP_ASYNC_BULK_TENSOR_OPCODE_G2S(5D, TILE, IsMultiCast,
-                                                 IsCacheHint, IsShared32);
-    default:
-      llvm_unreachable(
-          "Invalid Dimension in tile mode for GetCpAsyncBulkTensorG2SOpcode.");
-    }
-  }
-}
-
-static size_t GetDimsFromIntrinsic(unsigned IID) {
-  switch (IID) {
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_3d:
-    return 3;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_4d:
-    return 4;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_prefetch_im2col_5d:
-    return 5;
-  default:
-    llvm_unreachable("Invalid im2col intrinsic in GetDimsFromIntrinsic.");
-  }
-}
-
-void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorG2SCommon(SDNode *N,
-                                                         bool IsIm2Col) {
-  // We have {Chain, Intrinsic-ID} followed by the actual intrinsic args:
-  // {dst, mbar, src, dims{d0...dN}, im2col_offsets{dims-2}
-  //  multicast, cache_hint,
-  //  multicast_flag, cache_hint_flag, cta_group_flag}
-  // NumOperands = {Chain, IID} + {Actual intrinsic args}
-  //             = {2} + {8 + dims + im2col_offsets}
-  size_t NumOps = N->getNumOperands();
-  size_t NumDims = IsIm2Col ? GetDimsFromIntrinsic(N->getConstantOperandVal(1))
-                            : (NumOps - 10);
-  // Offsets is always 'NumDims - 2' and only for im2col mode
-  size_t NumOffsets = IsIm2Col ? (NumDims - 2) : 0;
-  bool IsCacheHint = N->getConstantOperandVal(NumOps - 2) == 1;
-  bool IsMultiCast = N->getConstantOperandVal(NumOps - 3) == 1;
-  size_t NumBaseArgs = NumDims + NumOffsets + 3; // for {dst, mbar, src}
-  size_t MultiCastIdx = NumBaseArgs + 2;         // for Chain and IID
-
-  unsigned CTAGroupVal = N->getConstantOperandVal(NumOps - 1);
-  if ((CTAGroupVal > 0) && !Subtarget->hasCpAsyncBulkTensorCTAGroupSupport())
-    report_fatal_error(
-        formatv("CpAsyncBulkTensorG2S cta_group::1/2 is not supported on sm_{}",
-                Subtarget->getSmVersion()));
-
-  SDLoc DL(N);
-  SmallVector<SDValue, 8> Ops(N->ops().slice(2, NumBaseArgs));
-
-  // Push MultiCast operand, if available
-  if (IsMultiCast)
-    Ops.push_back(N->getOperand(MultiCastIdx));
-
-  // Push CacheHint operand, if available
-  if (IsCacheHint)
-    Ops.push_back(N->getOperand(MultiCastIdx + 1));
-
-  // Flag for CTA Group
-  Ops.push_back(getI32Imm(CTAGroupVal, DL));
-
-  // Finally, the chain operand
-  Ops.push_back(N->getOperand(0));
-
-  bool IsShared32 =
-      CurDAG->getDataLayout().getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32;
-  unsigned Opcode = GetCpAsyncBulkTensorG2SOpcode(
-      NumDims, IsShared32, IsMultiCast, IsCacheHint, IsIm2Col);
-  ReplaceNode(N, CurDAG->getMachineNode(Opcode, DL, N->getVTList(), Ops));
-}
-
 void NVPTXDAGToDAGISel::SelectCpAsyncBulkTensorReduceCommon(SDNode *N,
                                                             unsigned RedOp,
                                                             bool IsIm2Col) {
@@ -2175,18 +2058,6 @@ bool NVPTXDAGToDAGISel::tryIntrinsicVoid(SDNode *N) {
   switch (IID) {
   default:
     return false;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_1d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_2d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_tile_5d:
-    SelectCpAsyncBulkTensorG2SCommon(N);
-    return true;
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_3d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_4d:
-  case Intrinsic::nvvm_cp_async_bulk_tensor_g2s_im2col_5d:
-    SelectCpAsyncBulkTensorG2SCommon(N, /*IsIm2Col=*/true);
-    return true;
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_1d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_2d:
   case Intrinsic::nvvm_cp_async_bulk_tensor_reduce_add_tile_3d:

llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.h

Lines changed: 0 additions & 1 deletion

@@ -86,7 +86,6 @@ class LLVM_LIBRARY_VISIBILITY NVPTXDAGToDAGISel : public SelectionDAGISel {
   bool tryEXTRACT_VECTOR_ELEMENT(SDNode *N);
   void SelectV2I64toI128(SDNode *N);
   void SelectI128toV2I64(SDNode *N);
-  void SelectCpAsyncBulkTensorG2SCommon(SDNode *N, bool IsIm2Col = false);
   void SelectCpAsyncBulkTensorReduceCommon(SDNode *N, unsigned RedOp,
                                            bool IsIm2Col = false);
   void SelectTcgen05Ld(SDNode *N, bool hasOffset = false);

llvm/lib/Target/NVPTX/NVPTXInstrInfo.td

Lines changed: 0 additions & 1 deletion

@@ -139,7 +139,6 @@ def noHWROT32 : Predicate<"!Subtarget->hasHWROT32()">;
 def hasDotInstructions : Predicate<"Subtarget->hasDotInstructions()">;
 def hasTcgen05Instructions : Predicate<"Subtarget->hasTcgen05Instructions()">;
 def hasTcgen05MMAScaleInputDImm : Predicate<"Subtarget->hasTcgen05MMAScaleInputDImm()">;
-def hasTMACTAGroupSupport : Predicate<"Subtarget->hasCpAsyncBulkTensorCTAGroupSupport()">;
 def hasF32x2Instructions : Predicate<"Subtarget->hasF32x2Instructions()">;
 
 class hasPTX<int version>: Predicate<"Subtarget->getPTXVersion() >= " # version>;

llvm/lib/Target/NVPTX/NVPTXIntrinsics.td

Lines changed: 31 additions & 74 deletions

@@ -599,75 +599,15 @@ class TMA_IM2COL_UTIL<int dim, string mode> {
   string base_str = !interleave(!foreach(i, !range(offsets), "$im2col" # i), ", ");
 }
 
-// From Global to Shared memory (G2S)
-class G2S_STRINGS<int dim, string mode, bit mc, bit ch, bit is_shared32 = 0> {
-  string prefix = "cp.async.bulk.tensor";
-  string dir = "shared::cluster.global";
-  string completion = "mbarrier::complete_tx::bytes";
-  string inst_name = prefix
-                     # "." # dim # "d"
-                     # "." # dir
-                     # "." # mode
-                     # "." # completion
-                     # !if(mc, ".multicast::cluster", "")
-                     # !if(ch, ".L2::cache_hint", "");
-  string intr_name = "CP_ASYNC_BULK_TENSOR_G2S_"
-                     # dim # "D"
-                     # !if(is_shared32, "_SHARED32", "")
-                     # !if(!eq(mode, "tile"), "_TILE", "_IM2COL");
-}
-
 def CTAGroupFlags : Operand<i32> {
   let PrintMethod = "printCTAGroup";
 }
 
-multiclass CP_ASYNC_BULK_TENSOR_G2S_INTR<int dim, bit is_shared32, string mode> {
-  defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
-  defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
-  defvar asm_str_default = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";
-  defvar rc = !if(is_shared32, B32, B64);
-
-  defvar num_im2col = !if(!ge(dim, 3), !add(dim, -2), 0);
-  defvar im2col_dag = !if(!eq(mode, "im2col"),
-    !dag(ins, !listsplat(B16, num_im2col), !foreach(i, !range(num_im2col), "im2col" # i)),
-    (ins));
-  defvar im2col_str = !interleave(!foreach(i, !range(num_im2col), "$im2col" # i), ", ");
-  defvar im2col_asm_str = ", {{" # im2col_str # "}}";
-
-  defvar asm_str = !if(!eq(mode, "im2col"),
-    !strconcat(asm_str_default, im2col_asm_str), asm_str_default);
+def tma_cta_group_imm0 : TImmLeaf<i32, [{return Imm == 0;}]>;
+def tma_cta_group_imm_any : TImmLeaf<i32, [{return Imm >= 0;}]>;
 
-  def "" : NVPTXInst<(outs),
-             !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag, (ins CTAGroupFlags:$cg)),
-             !strconcat(G2S_STRINGS<dim, mode, 0, 0>.inst_name, asm_str, ";")>,
-             Requires<[hasPTX<80>, hasSM<90>]>;
-  def _MC : NVPTXInst<(outs),
-              !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
-                   (ins B16:$mc, CTAGroupFlags:$cg)),
-              !strconcat(G2S_STRINGS<dim, mode, 1, 0>.inst_name, asm_str, ", $mc;")>,
-              Requires<[hasPTX<80>, hasSM<90>]>;
-  def _CH : NVPTXInst<(outs),
-              !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
-                   (ins B64:$ch, CTAGroupFlags:$cg)),
-              !strconcat(G2S_STRINGS<dim, mode, 0, 1>.inst_name, asm_str, ", $ch;")>,
-              Requires<[hasPTX<80>, hasSM<90>]>;
-  def _MC_CH : NVPTXInst<(outs),
-                 !con((ins rc:$dst, rc:$mbar, B64:$tmap), dims_dag, im2col_dag,
-                      (ins B16:$mc, B64:$ch, CTAGroupFlags:$cg)),
-                 !strconcat(G2S_STRINGS<dim, mode, 1, 1>.inst_name, asm_str, ", $mc, $ch;")>,
-                 Requires<[hasPTX<80>, hasSM<90>]>;
-}
-
-foreach dim = [1, 2, 3, 4, 5] in {
-  foreach shared32 = [true, false] in {
-    foreach mode = !if(!ge(dim, 3), ["tile", "im2col"], ["tile"]) in {
-      defm G2S_STRINGS<dim, mode, 0, 0, shared32>.intr_name :
-        CP_ASYNC_BULK_TENSOR_G2S_INTR<dim, shared32, mode>;
-    }
-  }
-}
-
-multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []> {
+multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred,
+                               TImmLeaf cta_group_type = tma_cta_group_imm_any> {
   defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;
   defvar dims_str = TMA_DIMS_UTIL<dim>.base_str;
   defvar asm_str_base = "$cg [$dst], [$tmap, {{" # dims_str # "}}], [$mbar]";

@@ -697,10 +637,10 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []>
                               !setdagop(dims_dag, intr),
                               !setdagop(im2col_dag, intr),
                               (intr B16:$mc, B64:$ch));
-  defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, timm:$cg));
-  defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, timm:$cg));
-  defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, timm:$cg));
-  defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, timm:$cg));
+  defvar intr_dag_no_hints = !con(intr_dag_base, (intr 0, 0, cta_group_type:$cg));
+  defvar intr_dag_with_mc = !con(intr_dag_base, (intr -1, 0, cta_group_type:$cg));
+  defvar intr_dag_with_ch = !con(intr_dag_base, (intr 0, -1, cta_group_type:$cg));
+  defvar intr_dag_with_mc_ch = !con(intr_dag_base, (intr -1, -1, cta_group_type:$cg));
 
   def "" : NVPTXInst<(outs), ins_dag,
                      inst_name # asm_str # ";",

@@ -719,14 +659,30 @@ multiclass TMA_TENSOR_G2S_INTR<int dim, string mode, list<Predicate> pred = []>
                      [intr_dag_with_mc_ch]>,
                      Requires<pred>;
 }
+
+foreach dim = 1...5 in {
+  defm TMA_G2S_TILE_CG0_ # dim # "D"
+    : TMA_TENSOR_G2S_INTR<dim, "tile", [hasPTX<80>, hasSM<90>],
+                          tma_cta_group_imm0>;
+  defm TMA_G2S_TILE_ # dim # "D"
+    : TMA_TENSOR_G2S_INTR<dim, "tile",
+                          [callSubtarget<"hasTMABlackwellSupport">]>;
+}
 foreach dim = 3...5 in {
+  defm TMA_G2S_IM2COL_CG0_ # dim # "D"
+    : TMA_TENSOR_G2S_INTR<dim, "im2col", [hasPTX<80>, hasSM<90>],
+                          tma_cta_group_imm0>;
+  defm TMA_G2S_IM2COL_ # dim # "D"
+    : TMA_TENSOR_G2S_INTR<dim, "im2col",
+                          [callSubtarget<"hasTMABlackwellSupport">]>;
   foreach mode = ["im2col_w", "im2col_w_128"] in {
     defm TMA_G2S_ # !toupper(mode) # "_" # dim # "D"
-      : TMA_TENSOR_G2S_INTR<dim, mode, [hasTMACTAGroupSupport]>;
+      : TMA_TENSOR_G2S_INTR<dim, mode,
+                            [callSubtarget<"hasTMABlackwellSupport">]>;
   }
 }
 defm TMA_G2S_TILE_GATHER4_2D : TMA_TENSOR_G2S_INTR<5, "tile_gather4",
-                                                   [hasTMACTAGroupSupport]>;
+                                                   [callSubtarget<"hasTMABlackwellSupport">]>;
 
 multiclass TMA_TENSOR_G2S_CTA_INTR<int dim, string mode, list<Predicate> pred = []> {
   defvar dims_dag = TMA_DIMS_UTIL<dim>.ins_dag;

@@ -784,7 +740,8 @@ foreach dim = 3...5 in {
     : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w", [hasPTX<86>, hasSM<100>]>;
 
   defm TMA_G2S_CTA_IM2COL_W_128_ # dim # "D"
-    : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128", [hasTMACTAGroupSupport]>;
+    : TMA_TENSOR_G2S_CTA_INTR<dim, "im2col_w_128",
+                              [callSubtarget<"hasTMABlackwellSupport">]>;
 }
 defm TMA_G2S_CTA_TILE_GATHER4_2D : TMA_TENSOR_G2S_CTA_INTR<5, "tile_gather4",
                                                            [hasPTX<86>, hasSM<100>]>;

@@ -835,7 +792,7 @@ foreach dim = 1...5 in {
   }
 }
 defm TMA_S2G_TILE_SCATTER4_2D : TMA_TENSOR_S2G_INTR<5, "tile_scatter4",
-                                                    [hasTMACTAGroupSupport]>;
+                                                    [callSubtarget<"hasTMABlackwellSupport">]>;
 
 def TMAReductionFlags : Operand<i32> {
   let PrintMethod = "printTmaReductionMode";

@@ -930,11 +887,11 @@ foreach dim = 3...5 in {
   foreach mode = ["im2col_w", "im2col_w_128"] in {
     defvar suffix = !toupper(mode) # "_" # dim # "D";
     defm TMA_TENSOR_PF_ # suffix : TMA_TENSOR_PREFETCH_INTR<dim, mode,
-                                                            [hasTMACTAGroupSupport]>;
+                                                            [callSubtarget<"hasTMABlackwellSupport">]>;
   }
 }
 defm TMA_TENSOR_PF_TILE_GATHER4_2D : TMA_TENSOR_PREFETCH_INTR<5, "tile_gather4",
-                                                              [hasTMACTAGroupSupport]>;
+                                                              [callSubtarget<"hasTMABlackwellSupport">]>;
 
 //Prefetchu and Prefetch

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 9 additions & 12 deletions

@@ -166,18 +166,15 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
   // f32x2 instructions in Blackwell family
   bool hasF32x2Instructions() const;
 
-  // TMA G2S copy with cta_group::1/2 support
-  bool hasCpAsyncBulkTensorCTAGroupSupport() const {
-    // TODO: Update/tidy-up after the family-conditional support arrives
-    switch (FullSmVersion) {
-    case 1003:
-    case 1013:
-      return PTXVersion >= 86;
-    case 1033:
-      return PTXVersion >= 88;
-    default:
-      return false;
-    }
+  // Checks support for the following in TMA:
+  // - cta_group::1/2 support
+  // - im2col_w/w_128 mode support
+  // - tile_gather4 mode support
+  // - tile_scatter4 mode support
+  bool hasTMABlackwellSupport() const {
+    return hasPTXWithFamilySMs(90, {100, 110}) ||
+           hasPTXWithFamilySMs(88, {100, 101}) ||
+           hasPTXWithAccelSMs(86, {100, 101});
   }
 
   // Prior to CUDA 12.3 ptxas did not recognize that the trap instruction
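Read against the updated tests below, and assuming hasPTXWithFamilySMs/hasPTXWithAccelSMs gate on the listed SM families ("f" suffix) or accelerated variants ("a" suffix) plus the given minimum PTX version, the predicate admits at least these target combinations:

; Hypothetical llc invocations that satisfy hasTMABlackwellSupport, mirroring
; the RUN lines added to the tests below (assumed reading of the helpers):
;   llc -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86   ; accel SM,  PTX >= 8.6
;   llc -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88   ; family SM, PTX >= 8.8
;   llc -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90   ; family SM, PTX >= 9.0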

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-1cta.ll

Lines changed: 4 additions & 0 deletions

@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-2cta.ll

Lines changed: 4 additions & 0 deletions

@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"

llvm/test/CodeGen/NVPTX/cp-async-bulk-tensor-g2s-gather4.ll

Lines changed: 4 additions & 0 deletions

@@ -1,8 +1,12 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| FileCheck --check-prefixes=CHECK-PTX-SHARED32 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | FileCheck --check-prefixes=CHECK-PTX64 %s
+; RUN: llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | FileCheck --check-prefixes=CHECK-PTX64 %s
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86| %ptxas-verify -arch=sm_100a %}
 ; RUN: %if ptxas-sm_100a && ptxas-isa-8.6 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100a -mattr=+ptx86 --nvptx-short-ptr| %ptxas-verify -arch=sm_100a %}
+; RUN: %if ptxas-sm_100f && ptxas-isa-8.8 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_100f -mattr=+ptx88 | %ptxas-verify -arch=sm_100f %}
+; RUN: %if ptxas-sm_110f && ptxas-isa-9.0 %{ llc < %s -mtriple=nvptx64 -mcpu=sm_110f -mattr=+ptx90 | %ptxas-verify -arch=sm_110f %}
 
 target triple = "nvptx64-nvidia-cuda"
