Skip to content

Commit d01d16c

Browse files
committed
New fix with test updates
1 parent 2ab198f commit d01d16c

File tree

5 files changed

+52
-7
lines changed

5 files changed

+52
-7
lines changed

llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3453,7 +3453,8 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
34533453
}
34543454

34553455
static std::pair<MemSDNode *, uint32_t>
3456-
convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG) {
3456+
convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG,
3457+
const NVPTXSubtarget &STI) {
34573458
SDValue Chain = N->getOperand(0);
34583459
SDValue BasePtr = N->getOperand(1);
34593460
SDValue Mask = N->getOperand(3);
@@ -3495,6 +3496,11 @@ convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG) {
34953496
MemSDNode *NewLD = cast<MemSDNode>(
34963497
DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand()).getNode());
34973498

3499+
// If our subtarget does not support the used bytes mask pragma, "drop" the
3500+
// mask by setting it to UINT32_MAX
3501+
if (!STI.hasUsedBytesMaskPragma())
3502+
UsedBytesMask = UINT32_MAX;
3503+
34983504
return {NewLD, UsedBytesMask};
34993505
}
35003506

@@ -3531,7 +3537,8 @@ replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
35313537
// If we have a masked load, convert it to a normal load now
35323538
std::optional<uint32_t> UsedBytesMask = std::nullopt;
35333539
if (LD->getOpcode() == ISD::MLOAD)
3534-
std::tie(LD, UsedBytesMask) = convertMLOADToLoadWithUsedBytesMask(LD, DAG);
3540+
std::tie(LD, UsedBytesMask) =
3541+
convertMLOADToLoadWithUsedBytesMask(LD, DAG, STI);
35353542

35363543
// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
35373544
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
@@ -3667,8 +3674,8 @@ SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
36673674
// them here.
36683675
EVT VT = Op.getValueType();
36693676
if (NVPTX::isPackedVectorTy(VT)) {
3670-
auto Result =
3671-
convertMLOADToLoadWithUsedBytesMask(cast<MemSDNode>(Op.getNode()), DAG);
3677+
auto Result = convertMLOADToLoadWithUsedBytesMask(
3678+
cast<MemSDNode>(Op.getNode()), DAG, STI);
36723679
MemSDNode *LD = std::get<0>(Result);
36733680
uint32_t UsedBytesMask = std::get<1>(Result);
36743681

llvm/lib/Target/NVPTX/NVPTXSubtarget.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -89,6 +89,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
8989
return SmVersion >= 100 && PTXVersion >= 88 &&
9090
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
9191
}
92+
bool hasUsedBytesMaskPragma() const {
93+
return SmVersion >= 50 && PTXVersion >= 83;
94+
}
9295
bool hasAtomAddF64() const { return SmVersion >= 60; }
9396
bool hasAtomScope() const { return SmVersion >= 60; }
9497
bool hasAtomBitwise64() const { return SmVersion >= 32; }

llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -50,7 +50,6 @@ define half @fh(ptr %p) {
5050
; ENABLED-EMPTY:
5151
; ENABLED-NEXT: // %bb.0:
5252
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
53-
; ENABLED-NEXT: .pragma "used_bytes_mask 0x3ff";
5453
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
5554
; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
5655
; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;

llvm/test/CodeGen/NVPTX/param-vectorize-device.ll

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -171,7 +171,6 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
171171
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
172172
; CHECK-LABEL: callee_St4x3(
173173
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
174-
; CHECK: .pragma "used_bytes_mask 0xfff";
175174
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
176175
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
177176
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
@@ -394,7 +393,6 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
394393
; CHECK-LABEL: callee_St4x7(
395394
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
396395
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
397-
; CHECK: .pragma "used_bytes_mask 0xfff";
398396
; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
399397
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
400398
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
2+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | FileCheck %s -check-prefixes=NOMASK
3+
; RUN: %if ptxas-sm_90 && ptxas-isa-8.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | %ptxas-verify -arch=sm_90 %}
4+
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck %s -check-prefixes=MASK
5+
; RUN: %if ptxas-sm_90 && ptxas-isa-8.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %}
6+
7+
; On older architectures and PTX versions, the used_bytes_mask pragma must not be emitted.
8+
; Specifically, the pragma is only supported on SM_50 or later, and PTX 8.3 or later.
9+
; The architecture is fixed at SM_90 for stability; only the PTX version is varied to exercise the pragma.
10+
11+
define i32 @global_8xi32(ptr %a, ptr %b) {
12+
; NOMASK-LABEL: global_8xi32(
13+
; NOMASK: {
14+
; NOMASK-NEXT: .reg .b32 %r<5>;
15+
; NOMASK-NEXT: .reg .b64 %rd<2>;
16+
; NOMASK-EMPTY:
17+
; NOMASK-NEXT: // %bb.0:
18+
; NOMASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
19+
; NOMASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
20+
; NOMASK-NEXT: st.param.b32 [func_retval0], %r1;
21+
; NOMASK-NEXT: ret;
22+
;
23+
; MASK-LABEL: global_8xi32(
24+
; MASK: {
25+
; MASK-NEXT: .reg .b32 %r<5>;
26+
; MASK-NEXT: .reg .b64 %rd<2>;
27+
; MASK-EMPTY:
28+
; MASK-NEXT: // %bb.0:
29+
; MASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
30+
; MASK-NEXT: .pragma "used_bytes_mask 0xfff";
31+
; MASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
32+
; MASK-NEXT: st.param.b32 [func_retval0], %r1;
33+
; MASK-NEXT: ret;
34+
%a.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 %a, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison)
35+
%first = extractelement <4 x i32> %a.load, i32 0
36+
ret i32 %first
37+
}
38+
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr , <4 x i1>, <4 x i32>)

0 commit comments

Comments
 (0)