Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -3453,7 +3453,8 @@ SDValue NVPTXTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
}

static std::pair<MemSDNode *, uint32_t>
convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG) {
convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG,
const NVPTXSubtarget &STI) {
SDValue Chain = N->getOperand(0);
SDValue BasePtr = N->getOperand(1);
SDValue Mask = N->getOperand(3);
Expand Down Expand Up @@ -3495,6 +3496,11 @@ convertMLOADToLoadWithUsedBytesMask(MemSDNode *N, SelectionDAG &DAG) {
MemSDNode *NewLD = cast<MemSDNode>(
DAG.getLoad(ResVT, DL, Chain, BasePtr, N->getMemOperand()).getNode());

// If the subtarget does not support the used bytes mask pragma, "drop" the
// mask by setting it to UINT32_MAX (all mask bits set).
if (!STI.hasUsedBytesMaskPragma())
UsedBytesMask = UINT32_MAX;

return {NewLD, UsedBytesMask};
}

Expand Down Expand Up @@ -3531,7 +3537,8 @@ replaceLoadVector(SDNode *N, SelectionDAG &DAG, const NVPTXSubtarget &STI) {
// If we have a masked load, convert it to a normal load now
std::optional<uint32_t> UsedBytesMask = std::nullopt;
if (LD->getOpcode() == ISD::MLOAD)
std::tie(LD, UsedBytesMask) = convertMLOADToLoadWithUsedBytesMask(LD, DAG);
std::tie(LD, UsedBytesMask) =
convertMLOADToLoadWithUsedBytesMask(LD, DAG, STI);

// Since LoadV2 is a target node, we cannot rely on DAG type legalization.
// Therefore, we must ensure the type is legal. For i1 and i8, we set the
Expand Down Expand Up @@ -3667,8 +3674,8 @@ SDValue NVPTXTargetLowering::LowerMLOAD(SDValue Op, SelectionDAG &DAG) const {
// them here.
EVT VT = Op.getValueType();
if (NVPTX::isPackedVectorTy(VT)) {
auto Result =
convertMLOADToLoadWithUsedBytesMask(cast<MemSDNode>(Op.getNode()), DAG);
auto Result = convertMLOADToLoadWithUsedBytesMask(
cast<MemSDNode>(Op.getNode()), DAG, STI);
MemSDNode *LD = std::get<0>(Result);
uint32_t UsedBytesMask = std::get<1>(Result);

Expand Down
3 changes: 3 additions & 0 deletions llvm/lib/Target/NVPTX/NVPTXSubtarget.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ class NVPTXSubtarget : public NVPTXGenSubtargetInfo {
return SmVersion >= 100 && PTXVersion >= 88 &&
AS == NVPTXAS::ADDRESS_SPACE_GLOBAL;
}
// Whether the used bytes mask pragma may be emitted for this subtarget.
// Both conditions must hold: SmVersion of at least 50 and PTXVersion of at
// least 83.
bool hasUsedBytesMaskPragma() const {
if (SmVersion < 50)
return false;
return PTXVersion >= 83;
}
// True when SmVersion is at least 60.
// NOTE(review): by its name this gates f64 atomic add support -- confirm.
bool hasAtomAddF64() const { return SmVersion >= 60; }
// True when SmVersion is at least 60.
// NOTE(review): by its name this gates scoped atomic operations -- confirm.
bool hasAtomScope() const { return SmVersion >= 60; }
// True when SmVersion is at least 32.
// NOTE(review): by its name this gates 64-bit bitwise atomics -- confirm.
bool hasAtomBitwise64() const { return SmVersion >= 32; }
Expand Down
1 change: 0 additions & 1 deletion llvm/test/CodeGen/NVPTX/LoadStoreVectorizer.ll
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,6 @@ define half @fh(ptr %p) {
; ENABLED-EMPTY:
; ENABLED-NEXT: // %bb.0:
; ENABLED-NEXT: ld.param.b64 %rd1, [fh_param_0];
; ENABLED-NEXT: .pragma "used_bytes_mask 0x3ff";
; ENABLED-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; ENABLED-NEXT: { .reg .b16 tmp; mov.b32 {%rs1, tmp}, %r3; }
; ENABLED-NEXT: mov.b32 {%rs2, %rs3}, %r2;
Expand Down
2 changes: 0 additions & 2 deletions llvm/test/CodeGen/NVPTX/param-vectorize-device.ll
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,6 @@ define internal fastcc [3 x i32] @callee_St4x3(ptr nocapture noundef readonly by
; CHECK: .func (.param .align 16 .b8 func_retval0[12])
; CHECK-LABEL: callee_St4x3(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x3_param_0[12]
; CHECK: .pragma "used_bytes_mask 0xfff";
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], %{{.*}}}, [callee_St4x3_param_0];
; CHECK-DAG: st.param.v2.b32 [func_retval0], {[[R1]], [[R2]]};
; CHECK-DAG: st.param.b32 [func_retval0+8], [[R3]];
Expand Down Expand Up @@ -394,7 +393,6 @@ define internal fastcc [7 x i32] @callee_St4x7(ptr nocapture noundef readonly by
; CHECK-LABEL: callee_St4x7(
; CHECK-NEXT: .param .align 16 .b8 callee_St4x7_param_0[28]
; CHECK: ld.param.v4.b32 {[[R1:%r[0-9]+]], [[R2:%r[0-9]+]], [[R3:%r[0-9]+]], [[R4:%r[0-9]+]]}, [callee_St4x7_param_0];
; CHECK: .pragma "used_bytes_mask 0xfff";
; CHECK: ld.param.v4.b32 {[[R5:%r[0-9]+]], [[R6:%r[0-9]+]], [[R7:%r[0-9]+]], %{{.*}}}, [callee_St4x7_param_0+16];
; CHECK-DAG: st.param.v4.b32 [func_retval0], {[[R1]], [[R2]], [[R3]], [[R4]]};
; CHECK-DAG: st.param.v2.b32 [func_retval0+16], {[[R5]], [[R6]]};
Expand Down
38 changes: 38 additions & 0 deletions llvm/test/CodeGen/NVPTX/used-bytes-mask.ll
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | FileCheck %s -check-prefixes=NOMASK
; RUN: %if ptxas-sm_90 && ptxas-isa-8.2 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx82 | %ptxas-verify -arch=sm_90 %}
; RUN: llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | FileCheck %s -check-prefixes=MASK
; RUN: %if ptxas-sm_90 && ptxas-isa-8.3 %{ llc < %s -march=nvptx64 -mcpu=sm_90 -mattr=+ptx83 | %ptxas-verify -arch=sm_90 %}

; On older architectures and PTX versions, the used bytes mask pragma must not be emitted.
; Specifically, the pragma is only supported on SM_50 or later with PTX 8.3 or later.
; The architecture is fixed at SM_90 for stability; only the PTX version is varied to exercise the pragma.

; Performs a masked load of <4 x i32> from %a with only the first three lanes
; enabled and returns lane 0. The PTX 8.2 run expects a plain v4 load; the
; PTX 8.3 run expects the same load preceded by a used_bytes_mask pragma.
define i32 @global_8xi32(ptr %a, ptr %b) {
; NOMASK-LABEL: global_8xi32(
; NOMASK: {
; NOMASK-NEXT: .reg .b32 %r<5>;
; NOMASK-NEXT: .reg .b64 %rd<2>;
; NOMASK-EMPTY:
; NOMASK-NEXT: // %bb.0:
; NOMASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
; NOMASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; NOMASK-NEXT: st.param.b32 [func_retval0], %r1;
; NOMASK-NEXT: ret;
;
; MASK-LABEL: global_8xi32(
; MASK: {
; MASK-NEXT: .reg .b32 %r<5>;
; MASK-NEXT: .reg .b64 %rd<2>;
; MASK-EMPTY:
; MASK-NEXT: // %bb.0:
; MASK-NEXT: ld.param.b64 %rd1, [global_8xi32_param_0];
; MASK-NEXT: .pragma "used_bytes_mask 0xfff";
; MASK-NEXT: ld.v4.b32 {%r1, %r2, %r3, %r4}, [%rd1];
; MASK-NEXT: st.param.b32 [func_retval0], %r1;
; MASK-NEXT: ret;
; Lane mask <1,1,1,0>: three enabled i32 lanes, i.e. the low 12 bytes, which
; lines up with the 0xfff value expected in the PTX 8.3 checks. The %b
; argument is not used by the body.
%a.load = tail call <4 x i32> @llvm.masked.load.v4i32.p0(ptr align 16 %a, <4 x i1> <i1 true, i1 true, i1 true, i1 false>, <4 x i32> poison)
%first = extractelement <4 x i32> %a.load, i32 0
ret i32 %first
}
; Declaration of the masked-load intrinsic called by the test function; its
; call site passes a pointer, a <4 x i1> lane mask, and a <4 x i32> value
; (poison in this test).
declare <4 x i32> @llvm.masked.load.v4i32.p0(ptr , <4 x i1>, <4 x i32>)