fhahn
diff --git a/‎clang/test/Driver/riscv-profiles.c
Lines changed: 8 additions & 0 deletions b/‎clang/test/Driver/riscv-profiles.c
Lines changed: 8 additions & 0 deletions
diff --git a/‎compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
Lines changed: 3 additions & 3 deletions b/‎compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
Lines changed: 3 additions & 3 deletions
diff --git a/‎lld/test/ELF/aarch64-feature-pac.s
Lines changed: 7 additions & 5 deletions b/‎lld/test/ELF/aarch64-feature-pac.s
Lines changed: 7 additions & 5 deletions
diff --git a/‎llvm/docs/NVPTXUsage.rst
Lines changed: 64 additions & 0 deletions b/‎llvm/docs/NVPTXUsage.rst
Lines changed: 64 additions & 0 deletions
diff --git a/‎llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
Lines changed: 7 additions & 0 deletions b/‎llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h
Lines changed: 7 additions & 0 deletions
diff --git a/‎llvm/include/llvm/IR/IntrinsicsNVVM.td
Lines changed: 24 additions & 0 deletions b/‎llvm/include/llvm/IR/IntrinsicsNVVM.td
Lines changed: 24 additions & 0 deletions
diff --git a/‎llvm/include/llvm/ProfileData/CtxInstrContextNode.h
Lines changed: 3 additions & 3 deletions b/‎llvm/include/llvm/ProfileData/CtxInstrContextNode.h
Lines changed: 3 additions & 3 deletions
diff --git a/‎llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
Lines changed: 6 additions & 6 deletions b/‎llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
Lines changed: 6 additions & 6 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Lines changed: 7 additions & 7 deletions b/‎llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
Lines changed: 7 additions & 7 deletions
diff --git a/‎llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Lines changed: 2 additions & 2 deletions b/‎llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
Lines changed: 2 additions & 2 deletions
@@ -57,6 +57,7 @@
 // RVA22U64: "-target-feature" "+f"
 // RVA22U64: "-target-feature" "+d"
 // RVA22U64: "-target-feature" "+c"
+// RVA22U64: "-target-feature" "+b"
 // RVA22U64: "-target-feature" "+zic64b"
 // RVA22U64: "-target-feature" "+zicbom"
 // RVA22U64: "-target-feature" "+zicbop"
@@ -83,6 +84,7 @@
 // RVA22S64: "-target-feature" "+f"
 // RVA22S64: "-target-feature" "+d"
 // RVA22S64: "-target-feature" "+c"
+// RVA22S64: "-target-feature" "+b"
 // RVA22S64: "-target-feature" "+zic64b"
 // RVA22S64: "-target-feature" "+zicbom"
 // RVA22S64: "-target-feature" "+zicbop"
@@ -118,6 +120,7 @@
 // RVA23U64: "-target-feature" "+f"
 // RVA23U64: "-target-feature" "+d"
 // RVA23U64: "-target-feature" "+c"
+// RVA23U64: "-target-feature" "+b"
 // RVA23U64: "-target-feature" "+v"
 // RVA23U64: "-target-feature" "+zic64b"
 // RVA23U64: "-target-feature" "+zicbom"
@@ -156,6 +159,7 @@
 // RVA23S64: "-target-feature" "+f"
 // RVA23S64: "-target-feature" "+d"
 // RVA23S64: "-target-feature" "+c"
+// RVA23S64: "-target-feature" "+b"
 // RVA23S64: "-target-feature" "+v"
 // RVA23S64: "-target-feature" "+h"
 // RVA23S64: "-target-feature" "+zic64b"
@@ -217,6 +221,7 @@
 // RVB23U64: "-target-feature" "+f"
 // RVB23U64: "-target-feature" "+d"
 // RVB23U64: "-target-feature" "+c"
+// RVB23U64: "-target-feature" "+b"
 // RVB23U64: "-target-feature" "+zic64b"
 // RVB23U64: "-target-feature" "+zicbom"
 // RVB23U64: "-target-feature" "+zicbop"
@@ -249,6 +254,7 @@
 // RVB23S64: "-target-feature" "+f"
 // RVB23S64: "-target-feature" "+d"
 // RVB23S64: "-target-feature" "+c"
+// RVB23S64: "-target-feature" "+b"
 // RVB23S64: "-target-feature" "+zic64b"
 // RVB23S64: "-target-feature" "+zicbom"
 // RVB23S64: "-target-feature" "+zicbop"
@@ -290,6 +296,7 @@
 // RUN: %clang --target=riscv32 -### -c %s 2>&1 -march=rvm23u32 -menable-experimental-extensions \
 // RUN:   | FileCheck -check-prefix=RVM23U32 %s
 // RVM23U32: "-target-feature" "+m"
+// RVM23U32: "-target-feature" "+b"
 // RVM23U32: "-target-feature" "+zicbop"
 // RVM23U32: "-target-feature" "+zicond"
 // RVM23U32: "-target-feature" "+zicsr"
@@ -309,6 +316,7 @@
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+f"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+d"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+c"
+// PROFILE-WITH-ADDITIONAL: "-target-feature" "+b"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicbom"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicbop"
 // PROFILE-WITH-ADDITIONAL: "-target-feature" "+zicboz"
 
@@ -8,9 +8,9 @@
 //==============================================================================
 //
 // NOTE!
-// llvm/lib/ProfileData/CtxInstrContextNode.h and
+// llvm/include/llvm/ProfileData/CtxInstrContextNode.h and
 //   compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
-// must be exact copies of each other
+// must be exact copies of each other.
 //
 // compiler-rt creates these objects as part of the instrumentation runtime for
 // contextual profiling. LLVM only consumes them to convert a contextual tree
@@ -114,4 +114,4 @@ class ContextNode final {
 };
 } // namespace ctx_profile
 } // namespace llvm
-#endif
+#endif
@@ -76,12 +76,14 @@
 # PACDYN-NOT:      0x0000000070000001 (AARCH64_BTI_PLT)
 # PACDYN-NOT:      0x0000000070000003 (AARCH64_PAC_PLT)
 
-## Turn on PAC entries with the -z pac-plt command line option. There are no
-## warnings in this case as the choice to use PAC in PLT entries is orthogonal
-## to the choice of using PAC in relocatable objects. The presence of the PAC
-## .note.gnu.property is an indication of preference by the relocatable object.
+## Turn on PAC entries with the -z pac-plt command line option. For files w/o
+## GNU_PROPERTY_AARCH64_FEATURE_1_PAC set in GNU_PROPERTY_AARCH64_FEATURE_1_AND
+## property, emit a warning.
+
+# RUN: ld.lld %t.o %t2.o -z pac-plt %t.so -o %tpacplt.exe 2>&1 | FileCheck -DFILE=%t2.o --check-prefix WARN %s
+
+# WARN: warning: [[FILE]]: -z pac-plt: file does not have GNU_PROPERTY_AARCH64_FEATURE_1_PAC property
 
-# RUN: ld.lld %t.o %t2.o -z pac-plt %t.so -o %tpacplt.exe
 # RUN: llvm-readelf -n %tpacplt.exe | FileCheck --check-prefix=PACPROP %s
 # RUN: llvm-readelf --dynamic-table %tpacplt.exe | FileCheck --check-prefix PACDYN2 %s
 # RUN: llvm-objdump --no-print-imm-hex -d --mattr=+v8.3a --no-show-raw-insn %tpacplt.exe | FileCheck --check-prefix PACPLT %s
 
@@ -599,6 +599,70 @@ described in the ``s2g.tile`` mode intrinsics above.
 For more information, refer PTX ISA
 `<https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#data-movement-and-conversion-instructions-cp-async-bulk-tensor>`_.
 
+'``llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.1d(ptr %tensor_map, i32 %d0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.2d(..., i32 %d0, i32 %d1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.3d(..., i32 %d0, i32 %d1, i32 %d2, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.tile.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
+of PTX instructions. These instructions initiate an asynchronous prefetch
+of tensor data from global memory to the L2 cache. In tile mode, the
+multi-dimensional layout of the source tensor is preserved at the destination.
+The dimension of the tensor data ranges from 1d to 5d with the coordinates
+specified by the ``i32 %d0 ... i32 %d4`` arguments.
+
+* The last argument to these intrinsics is a boolean flag
+  indicating support for cache_hint. This flag argument must
+  be a compile-time constant. When set, it indicates a valid
+  cache_hint (``i64 %ch``) and generates the ``.L2::cache_hint``
+  variant of the PTX instruction.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
+
+'``llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``'
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+.. code-block:: llvm
+
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.3d(ptr %tensor_map, i32 %d0, i32 %d1, i32 %d2, i16 %im2col0, i64 %ch, i1 %flag_ch)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.4d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i16 %im2col0, i16 %im2col1, ...)
+  declare void @llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.5d(..., i32 %d0, i32 %d1, i32 %d2, i32 %d3, i32 %d4, i16 %im2col0, i16 %im2col1, i16 %im2col2, ...)
+
+Overview:
+"""""""""
+
+The '``@llvm.nvvm.cp.async.bulk.tensor.prefetch.im2col.[1-5]d``' intrinsics
+correspond to the ``cp.async.bulk.prefetch.tensor.[1-5]d.L2.global*`` set
+of PTX instructions. These instructions initiate an asynchronous prefetch
+of tensor data from global memory to the L2 cache. In im2col mode, some
+dimensions of the source tensor are unrolled into a single dimensional
+column at the destination. In this mode, the tensor has to be at least
+three-dimensional. Along with the tensor coordinates, im2col offsets are
+also specified (denoted by ``i16 im2col0...i16 %im2col2``). The number
+of im2col offsets is two less than the number of dimensions of the tensor
+operation. The last argument to these intrinsics is a boolean flag, with
+the same functionality as described in the ``tile`` mode intrinsics above.
+
+For more information, refer PTX ISA
+`<https://docs.nvidia.com/cuda/parallel-thread-execution/#data-movement-and-conversion-instructions-cp-async-bulk-prefetch-tensor>`_.
+
 Other Intrinsics
 ----------------
 
 
@@ -1102,6 +1102,13 @@ class LegalizeRuleSet {
     return minScalar(TypeIdx, MinTy).maxScalar(TypeIdx, MaxTy);
   }
 
+  LegalizeRuleSet &clampScalar(bool Pred, unsigned TypeIdx, const LLT MinTy,
+                               const LLT MaxTy) {
+    if (!Pred)
+      return *this;
+    return clampScalar(TypeIdx, MinTy, MaxTy);
+  }
+
   /// Limit the range of scalar sizes to MinTy and MaxTy.
   LegalizeRuleSet &clampScalarOrElt(unsigned TypeIdx, const LLT MinTy,
                                     const LLT MaxTy) {
 
@@ -613,6 +613,28 @@ class CP_ASYNC_BULK_TENSOR_S2G_INTR<int dim, string mode> {
         ImmArg<ArgIndex<FlagsStartIdx>>];
 }
 
+class CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<int dim, string mode> {
+  string Name = "int_nvvm_cp_async_bulk_tensor_prefetch_" # mode # "_" # dim # "d";
+
+  bit IsIm2Col = !if(!eq(mode, "im2col"), 1, 0);
+  int NumIm2ColOffsets = !if(IsIm2Col, !add(dim, -2), 0);
+  list<LLVMType> Im2ColOffsetsTy = !listsplat(llvm_i16_ty, NumIm2ColOffsets);
+  list<LLVMType> TensorDimsTy = !listsplat(llvm_i32_ty, dim);
+  list<LLVMType> ArgsTy = !listconcat(
+                          [llvm_ptr_ty],     // tensormap_ptr
+                           TensorDimsTy,     // actual tensor dims
+                           Im2ColOffsetsTy,  // im2col offsets
+                          [llvm_i64_ty,      // cache_hint
+                           llvm_i1_ty]       // Flag for cache_hint
+                          );
+
+  int TempFlagsStartIdx = !add(dim, 2);
+  int FlagsStartIdx = !add(TempFlagsStartIdx, NumIm2ColOffsets);
+  list<IntrinsicProperty> IntrProp = [IntrConvergent,
+        ReadOnly<ArgIndex<0>>, NoCapture<ArgIndex<0>>,
+        ImmArg<ArgIndex<FlagsStartIdx>>];
+}
+
 let TargetPrefix = "nvvm" in {
   def int_nvvm_prmt : ClangBuiltin<"__nvvm_prmt">,
       DefaultAttrsIntrinsic<[llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
@@ -4902,6 +4924,8 @@ foreach dim = [1, 2, 3, 4, 5] in {
       def g2s.Name : DefaultAttrsIntrinsic<[], g2s.ArgsTy, g2s.IntrProp>;
     foreach s2g = [CP_ASYNC_BULK_TENSOR_S2G_INTR<dim, mode>] in
       def s2g.Name : DefaultAttrsIntrinsic<[], s2g.ArgsTy, s2g.IntrProp>;
+    foreach prefetch = [CP_ASYNC_BULK_TENSOR_PREFETCH_INTR<dim, mode>] in
+      def prefetch.Name : DefaultAttrsIntrinsic<[], prefetch.ArgsTy, prefetch.IntrProp>;
   }
 }
 
 
@@ -8,9 +8,9 @@
 //==============================================================================
 //
 // NOTE!
-// llvm/lib/ProfileData/CtxInstrContextNode.h and
+// llvm/include/llvm/ProfileData/CtxInstrContextNode.h and
 //   compiler-rt/lib/ctx_profile/CtxInstrContextNode.h
-// must be exact copies of each other
+// must be exact copies of each other.
 //
 // compiler-rt creates these objects as part of the instrumentation runtime for
 // contextual profiling. LLVM only consumes them to convert a contextual tree
@@ -114,4 +114,4 @@ class ContextNode final {
 };
 } // namespace ctx_profile
 } // namespace llvm
-#endif
+#endif
@@ -995,9 +995,9 @@ static bool selectDebugInstr(MachineInstr &I, MachineRegisterInfo &MRI,
     LLT Ty = MRI.getType(Reg);
     const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
     const TargetRegisterClass *RC =
-        RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+        dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
     if (!RC) {
-      const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+      const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
       RC = getRegClassForTypeOnBank(Ty, RB);
       if (!RC) {
         LLVM_DEBUG(
@@ -2590,14 +2590,14 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
       const RegClassOrRegBank &RegClassOrBank =
         MRI.getRegClassOrRegBank(DefReg);
 
-      const TargetRegisterClass *DefRC
-        = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+      const TargetRegisterClass *DefRC =
+          dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
       if (!DefRC) {
         if (!DefTy.isValid()) {
           LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
           return false;
         }
-        const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+        const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
         DefRC = getRegClassForTypeOnBank(DefTy, RB);
         if (!DefRC) {
           LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
@@ -4677,7 +4677,7 @@ AArch64InstructionSelector::emitCSINC(Register Dst, Register Src1,
   // If we used a register class, then this won't necessarily have an LLT.
   // Compute the size based off whether or not we have a class or bank.
   unsigned Size;
-  if (const auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
+  if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
     Size = TRI.getRegSizeInBits(*RC);
   else
     Size = MRI.getType(Dst).getSizeInBits();
 
@@ -81,7 +81,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
 
   auto &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
   const TargetRegisterClass *RC =
-      RegClassOrBank.dyn_cast<const TargetRegisterClass*>();
+      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
   if (RC) {
     const LLT Ty = MRI.getType(Reg);
     if (!Ty.isValid() || Ty.getSizeInBits() != 1)
@@ -91,7 +91,7 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
            RC->hasSuperClassEq(TRI.getBoolRC());
   }
 
-  const RegisterBank *RB = RegClassOrBank.get<const RegisterBank *>();
+  const RegisterBank *RB = cast<const RegisterBank *>(RegClassOrBank);
   return RB->getID() == AMDGPU::VCCRegBankID;
 }
 
@@ -233,15 +233,15 @@ bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
   const RegClassOrRegBank &RegClassOrBank =
     MRI->getRegClassOrRegBank(DefReg);
 
-  const TargetRegisterClass *DefRC
-    = RegClassOrBank.dyn_cast<const TargetRegisterClass *>();
+  const TargetRegisterClass *DefRC =
+      dyn_cast<const TargetRegisterClass *>(RegClassOrBank);
   if (!DefRC) {
     if (!DefTy.isValid()) {
       LLVM_DEBUG(dbgs() << "PHI operand has no type, not a gvreg?\n");
       return false;
     }
 
-    const RegisterBank &RB = *RegClassOrBank.get<const RegisterBank *>();
+    const RegisterBank &RB = *cast<const RegisterBank *>(RegClassOrBank);
     DefRC = TRI.getRegClassForTypeOnBank(DefTy, RB);
     if (!DefRC) {
       LLVM_DEBUG(dbgs() << "PHI operand has unexpected size/bank\n");
@@ -2395,11 +2395,11 @@ const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
   Register Reg, const MachineRegisterInfo &MRI,
   const TargetRegisterInfo &TRI) const {
   const RegClassOrRegBank &RegClassOrBank = MRI.getRegClassOrRegBank(Reg);
-  if (auto *RB = RegClassOrBank.dyn_cast<const RegisterBank *>())
+  if (auto *RB = dyn_cast<const RegisterBank *>(RegClassOrBank))
     return RB;
 
   // Ignore the type, since we don't use vcc in artifacts.
-  if (auto *RC = RegClassOrBank.dyn_cast<const TargetRegisterClass *>())
+  if (auto *RC = dyn_cast<const TargetRegisterClass *>(RegClassOrBank))
     return &RBI.getRegBankFromRegClass(*RC, LLT());
   return nullptr;
 }
 
@@ -3682,10 +3682,10 @@ const TargetRegisterClass *
 SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
                                          const MachineRegisterInfo &MRI) const {
   const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(MO.getReg());
-  if (const RegisterBank *RB = RCOrRB.dyn_cast<const RegisterBank*>())
+  if (const RegisterBank *RB = dyn_cast<const RegisterBank *>(RCOrRB))
     return getRegClassForTypeOnBank(MRI.getType(MO.getReg()), *RB);
 
-  if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
+  if (const auto *RC = dyn_cast<const TargetRegisterClass *>(RCOrRB))
     return getAllocatableClass(RC);
 
   return nullptr;