Skip to content

Commit 64909aa

Browse files
committed
Merge remote-tracking branch 'origin/main' into vplan-runtime-checks
2 parents 5ea6f7b + a5a1612 commit 64909aa

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

45 files changed

+2387
-367
lines changed

clang/test/Driver/riscv-profiles.c

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -57,6 +57,7 @@
5757
// RVA22U64: "-target-feature" "+f"
5858
// RVA22U64: "-target-feature" "+d"
5959
// RVA22U64: "-target-feature" "+c"
60+
// RVA22U64: "-target-feature" "+b"
6061
// RVA22U64: "-target-feature" "+zic64b"
6162
// RVA22U64: "-target-feature" "+zicbom"
6263
// RVA22U64: "-target-feature" "+zicbop"
@@ -83,6 +84,7 @@
8384
// RVA22S64: "-target-feature" "+f"
8485
// RVA22S64: "-target-feature" "+d"
8586
// RVA22S64: "-target-feature" "+c"
87+
// RVA22S64: "-target-feature" "+b"
8688
// RVA22S64: "-target-feature" "+zic64b"
8789
// RVA22S64: "-target-feature" "+zicbom"
8890
// RVA22S64: "-target-feature" "+zicbop"
@@ -118,6 +120,7 @@
118120
// RVA23U64: "-target-feature" "+f"
119121
// RVA23U64: "-target-feature" "+d"
120122
// RVA23U64: "-target-feature" "+c"
123+
// RVA23U64: "-target-feature" "+b"
121124
// RVA23U64: "-target-feature" "+v"
122125
// RVA23U64: "-target-feature" "+zic64b"
123126
// RVA23U64: "-target-feature" "+zicbom"
@@ -156,6 +159,7 @@
156159
// RVA23S64: "-target-feature" "+f"
157160
// RVA23S64: "-target-feature" "+d"
158161
// RVA23S64: "-target-feature" "+c"
162+
// RVA23S64: "-target-feature" "+b"
159163
// RVA23S64: "-target-feature" "+v"
160164
// RVA23S64: "-target-feature" "+h"
161165
// RVA23S64: "-target-feature" "+zic64b"
@@ -217,6 +221,7 @@
217221
// RVB23U64: "-target-feature" "+f"
218222
// RVB23U64: "-target-feature" "+d"
219223
// RVB23U64: "-target-feature" "+c"
224+
// RVB23U64: "-target-feature" "+b"
220225
// RVB23U64: "-target-feature" "+zic64b"
221226
// RVB23U64: "-target-feature" "+zicbom"
222227
// RVB23U64: "-target-feature" "+zicbop"
@@ -249,6 +254,7 @@
249254
// RVB23S64: "-target-feature" "+f"
250255
// RVB23S64: "-target-feature" "+d"
251256
// RVB23S64: "-target-feature" "+c"
257+
// RVB23S64: "-target-feature" "+b"
252258
// RVB23S64: "-target-feature" "+zic64b"
253259
// RVB23S64: "-target-feature" "+zicbom"
254260
// RVB23S64: "-target-feature" "+zicbop"
@@ -290,6 +296,7 @@
290296
// RUN: %clang --target=riscv32 -### -c %s 2>&1 -march=rvm23u32 -menable-experimental-extensions \
291297
// RUN: | FileCheck -check-prefix=RVM23U32 %s
292298
// RVM23U32: "-target-feature" "+m"
299+
// RVM23U32: "-target-feature" "+b"
293300
// RVM23U32: "-target-feature" "+zicbop"
294301
// RVM23U32: "-target-feature" "+zicond"
295302
// RVM23U32: "-target-feature" "+zicsr"
@@ -309,6 +316,7 @@
309316
// PROFILE-WITH-ADDITIONAL: "-target-feature" "+f"
310317
// PROFILE-WITH-ADDITIONAL: "-target-feature" "+d"
311318
// PROFILE-WITH-ADDITIONAL: "-target-feature" "+c"
319+
// PROFILE-WITH-ADDITIONAL: "-target-feature" "+b"
312320
// PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicbom"
313321
// PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicbop"
314322
// PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicboz"

compiler-rt/lib/ctx_profile/CtxInstrContextNode.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
//==============================================================================
99
//
1010
// NOTE!
11-
// llvm/lib/ProfileData/CtxInstrContextNode.h and
11+
// llvm/include/llvm/ProfileData/CtxInstrContextNode.h and
1212
// compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
13-
// must be exact copies of each other
13+
// must be exact copies of each other.
1414
//
1515
// compiler-rt creates these objects as part of the instrumentation runtime for
1616
// contextual profiling. LLVM only consumes them to convert a contextual tree
@@ -114,4 +114,4 @@ class ContextNode final {
114114
};
115115
} // namespace ctx_profile
116116
} // namespace llvm
117-
#endif
117+
#endif

lld/test/ELF/aarch64-feature-pac.s

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -76,12 +76,14 @@
7676
# PACDYN-NOT: 0x0000000070000001 (AARCH64_BTI_PLT)
7777
# PACDYN-NOT: 0x0000000070000003 (AARCH64_PAC_PLT)
7878

79-
## Turn on PAC entries with the -z pac-plt command line option. There are no
80-
## warnings in this case as the choice to use PAC in PLT entries is orthogonal
81-
## to the choice of using PAC in relocatable objects. The presence of the PAC
82-
## .note.gnu.property is an indication of preference by the relocatable object.
79+
## Turn on PAC entries with the -z pac-plt command line option. For files w/o
80+
## GNU_PROPERTY_AARCH64_FEATURE_1_PAC set in GNU_PROPERTY_AARCH64_FEATURE_1_AND
81+
## property, emit a warning.
82+
83+
# RUN: ld.lld %t.o %t2.o -z pac-plt %t.so -o %tpacplt.exe 2>&1 | FileCheck -DFILE=%t2.o --check-prefix WARN %s
84+
85+
# WARN: warning: [[FILE]]: -z pac-plt: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_PAC property
8386

84-
# RUN: ld.lld %t.o %t2.o -z pac-plt %t.so -o %tpacplt.exe
8587
# RUN: llvm-readelf -n %tpacplt.exe | FileCheck --check-prefix=PACPROP %s
8688
# RUN: llvm-readelf --dynamic-table %tpacplt.exe | FileCheck --check-prefix PACDYN2 %s
8789
# RUN: llvm-objdump --no-print-imm-hex -d --mattr=+v8.3a --no-show-raw-insn %tpacplt.exe | FileCheck --check-prefix PACPLT %s

llvm/docs/NVPTXUsage.rst

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -599,6 +599,70 @@ described in the ``s2g.tile`` mode intrinsics above.
599599
For more information, refer PTX ISA
600600
`<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
601601

602+
'``llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``'
603+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
604+
605+
Syntax:
606+
"""""""
607+
608+
.. code-block:: llvm
609+
610+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
611+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(..., i32 %d0, i32 %d1, ...)
612+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
613+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
614+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
615+
616+
Overview:
617+
"""""""""
618+
619+
The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``' intrinsics
620+
correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
621+
of PTX instructions. These instructions initiate an asynchronous prefetch
622+
of tensor data from global memory to the L2 cache. In tile mode, the
623+
multi-dimensional layout of the source tensor is preserved at the destination.
624+
The dimension of the tensor data ranges from 1d to 5d with the coordinates
625+
specified by the ``i32 %d0 ... i32 %d4`` arguments.
626+
627+
* The last argument to these intrinsics is a boolean flag
628+
indicating support for cache_hint. This flag argument must
629+
be a compile-time constant. When set, it indicates a valid
630+
cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
631+
variant of the PTX instruction.
632+
633+
For more information, refer PTX ISA
634+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
635+
636+
'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``'
637+
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
638+
639+
Syntax:
640+
"""""""
641+
642+
.. code-block:: llvm
643+
644+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
645+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
646+
declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
647+
648+
Overview:
649+
"""""""""
650+
651+
The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' intrinsics
652+
correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
653+
of PTX instructions. These instructions initiate an asynchronous prefetch
654+
of tensor data from global memory to the L2 cache. In im2col mode, some
655+
dimensions of the source tensor are unrolled into a single dimensional
656+
column at the destination. In this mode, the tensor has to be at least
657+
three-dimensional. Along with the tensor coordinates, im2col offsets are
658+
also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number
659+
of im2col offsets is two less than the number of dimensions of the tensor
660+
operation. The last argument to these intrinsics is a boolean flag, with
661+
the same functionality as described in the ``tile`` mode intrinsics above.
662+
663+
For more information, refer PTX ISA
664+
`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
665+
602666
Other Intrinsics
603667
----------------
604668

llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1102,6 +1102,13 @@ class LegalizeRuleSet {
11021102
return minScalar(TypeIdx, MinTy).maxScalar(TypeIdx, MaxTy);
11031103
}
11041104

1105+
LegalizeRuleSet &clampScalar(bool Pred, unsigned TypeIdx, const LLT MinTy,
1106+
const LLT MaxTy) {
1107+
if (!Pred)
1108+
return *this;
1109+
return clampScalar(TypeIdx, MinTy, MaxTy);
1110+
}
1111+
11051112
/// Limit the range of scalar sizes to MinTy and MaxTy.
11061113
LegalizeRuleSet &clampScalarOrElt(unsigned TypeIdx, const LLT MinTy,
11071114
const LLT MaxTy) {

llvm/include/llvm/IR/IntrinsicsNVVM.td

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -613,6 +613,28 @@ class CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, string mode> {
613613
ImmArg<ArgIndex<FlagsStartIdx>>];
614614
}
615615

616+
class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
617+
string Name = "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # "d";
618+
619+
bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0);
620+
int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0);
621+
list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
622+
list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
623+
list<LLVMType> ArgsTy = !listconcat(
624+
[llvm_ptr_ty], // tensormap_ptr
625+
TensorDimsTy, // actual tensor dims
626+
Im2ColOffsetsTy, // im2col offsets
627+
[llvm_i64_ty, // cache_hint
628+
llvm_i1_ty] // Flag for cache_hint
629+
);
630+
631+
int TempFlagsStartIdx = !add(dim, 2);
632+
int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets);
633+
list<IntrinsicProperty> IntrProp = [IntrConvergent,
634+
ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
635+
ImmArg<ArgIndex<FlagsStartIdx>>];
636+
}
637+
616638
let TargetPrefix = "nvvm" in {
617639
def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
618640
DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4902,6 +4924,8 @@ foreach dim = [1, 2, 3, 4, 5] in {
49024924
def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>;
49034925
foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, mode>] in
49044926
def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>;
4927+
foreach prefetch = [CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>] in
4928+
def prefetch.Name : DefaultAttrsIntrinsic<[], prefetch.ArgsTy, prefetch.IntrProp>;
49054929
}
49064930
}
49074931

llvm/include/llvm/ProfileData/CtxInstrContextNode.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,9 @@
88
//==============================================================================
99
//
1010
// NOTE!
11-
// llvm/lib/ProfileData/CtxInstrContextNode.h and
11+
// llvm/include/llvm/ProfileData/CtxInstrContextNode.h and
1212
// compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
13-
// must be exact copies of each other
13+
// must be exact copies of each other.
1414
//
1515
// compiler-rt creates these objects as part of the instrumentation runtime for
1616
// contextual profiling. LLVM only consumes them to convert a contextual tree
@@ -114,4 +114,4 @@ class ContextNode final {
114114
};
115115
} // namespace ctx_profile
116116
} // namespace llvm
117-
#endif
117+
#endif

llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -995,9 +995,9 @@ static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
995995
LLT Ty = MRI.getType(Reg);
996996
const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
997997
const TargetRegisterClass *RC =
998-
RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
998+
dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
999999
if (!RC) {
1000-
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
1000+
const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
10011001
RC = getRegClassForTypeOnBank(Ty, RB);
10021002
if (!RC) {
10031003
LLVM_DEBUG(
@@ -2590,14 +2590,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
25902590
const RegClassOrRegBank &RegClassOrBank =
25912591
MRI.getRegClassOrRegBank(DefReg);
25922592

2593-
const TargetRegisterClass *DefRC
2594-
= RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
2593+
const TargetRegisterClass *DefRC =
2594+
dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
25952595
if (!DefRC) {
25962596
if (!DefTy.isValid()) {
25972597
LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
25982598
return false;
25992599
}
2600-
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
2600+
const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
26012601
DefRC = getRegClassForTypeOnBank(DefTy, RB);
26022602
if (!DefRC) {
26032603
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
@@ -4677,7 +4677,7 @@ AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
46774677
// If we used a register class, then this won't necessarily have an LLT.
46784678
// Compute the size based off whether or not we have a class or bank.
46794679
unsigned Size;
4680-
if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
4680+
if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
46814681
Size = TRI.getRegSizeInBits(*RC);
46824682
else
46834683
Size = MRI.getType(Dst).getSizeInBits();

llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
8181

8282
auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
8383
const TargetRegisterClass *RC =
84-
RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
84+
dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
8585
if (RC) {
8686
const LLT Ty = MRI.getType(Reg);
8787
if (!Ty.isValid() || Ty.getSizeInBits() != 1)
@@ -91,7 +91,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
9191
RC->hasSuperClassEq(TRI.getBoolRC());
9292
}
9393

94-
const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
94+
const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
9595
return RB->getID() == AMDGPU::VCCRegBankID;
9696
}
9797

@@ -233,15 +233,15 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
233233
const RegClassOrRegBank &RegClassOrBank =
234234
MRI->getRegClassOrRegBank(DefReg);
235235

236-
const TargetRegisterClass *DefRC
237-
= RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
236+
const TargetRegisterClass *DefRC =
237+
dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
238238
if (!DefRC) {
239239
if (!DefTy.isValid()) {
240240
LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
241241
return false;
242242
}
243243

244-
const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
244+
const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
245245
DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
246246
if (!DefRC) {
247247
LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
@@ -2395,11 +2395,11 @@ const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
23952395
Register Reg, const MachineRegisterInfo &MRI,
23962396
const TargetRegisterInfo &TRI) const {
23972397
const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
2398-
if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
2398+
if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
23992399
return RB;
24002400

24012401
// Ignore the type, since we don't use vcc in artifacts.
2402-
if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
2402+
if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
24032403
return &RBI.getRegBankFromRegClass(*RC, LLT());
24042404
return nullptr;
24052405
}

llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3682,10 +3682,10 @@ const TargetRegisterClass *
36823682
SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
36833683
const MachineRegisterInfo &MRI) const {
36843684
const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
3685-
if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
3685+
if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
36863686
return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
36873687

3688-
if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
3688+
if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
36893689
return getAllocatableClass(RC);
36903690

36913691
return nullptr;

0 commit comments

Comments
 (0)