llvm
diff --git a/‎clang/docs/HIPSupport.rst‎
Lines changed: 7 additions & 7 deletions b/‎clang/docs/HIPSupport.rst‎
Lines changed: 7 additions & 7 deletions
diff --git a/‎clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp‎
Lines changed: 1 addition & 1 deletion b/‎clang/tools/clang-linker-wrapper/ClangLinkerWrapper.cpp‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp‎
Lines changed: 0 additions & 2 deletions b/‎compiler-rt/lib/fuzzer/FuzzerDataFlowTrace.cpp‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎flang-rt/lib/cuda/descriptor.cpp‎
Lines changed: 8 additions & 0 deletions b/‎flang-rt/lib/cuda/descriptor.cpp‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎flang/include/flang/Lower/LoweringOptions.def‎
Lines changed: 3 additions & 0 deletions b/‎flang/include/flang/Lower/LoweringOptions.def‎
Lines changed: 3 additions & 0 deletions
diff --git a/‎flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h‎
Lines changed: 5 additions & 0 deletions b/‎flang/include/flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎flang/include/flang/Runtime/CUDA/descriptor.h‎
Lines changed: 4 additions & 0 deletions b/‎flang/include/flang/Runtime/CUDA/descriptor.h‎
Lines changed: 4 additions & 0 deletions
diff --git a/‎flang/lib/Lower/ConvertCall.cpp‎
Lines changed: 14 additions & 0 deletions b/‎flang/lib/Lower/ConvertCall.cpp‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp‎
Lines changed: 15 additions & 0 deletions b/‎flang/lib/Optimizer/Builder/Runtime/CUDA/Descriptor.cpp‎
Lines changed: 15 additions & 0 deletions
diff --git a/‎flang/test/Lower/CUDA/cuda-runtime-check.cuf‎
Lines changed: 22 additions & 0 deletions b/‎flang/test/Lower/CUDA/cuda-runtime-check.cuf‎
Lines changed: 22 additions & 0 deletions
@@ -17,7 +17,7 @@
 HIP Support
 =============
 
-HIP (Heterogeneous-Compute Interface for Portability) `<https://github.com/ROCm-Developer-Tools/HIP>`_ is
+HIP (Heterogeneous-Compute Interface for Portability) `<https://github.com/ROCm/HIP>`_ is
 a C++ Runtime API and Kernel Language. It enables developers to create portable applications for
 offloading computation to different hardware platforms from a single source code.
 
@@ -41,9 +41,9 @@ backend or the out-of-tree LLVM-SPIRV translator. The SPIR-V is then bundled and
 .. note::
    While Clang does not directly provide HIP support for NVIDIA GPUs and CPUs, these platforms are supported via other means:
 
-   - NVIDIA GPUs: HIP support is offered through the HIP project `<https://github.com/ROCm-Developer-Tools/HIP>`_, which provides a header-only library for translating HIP runtime APIs into CUDA runtime APIs. The code is subsequently compiled using NVIDIA's `nvcc`.
+   - NVIDIA GPUs: HIP support is offered through the HIP project `<https://github.com/ROCm/HIP>`_, which provides a header-only library for translating HIP runtime APIs into CUDA runtime APIs. The code is subsequently compiled using NVIDIA's `nvcc`.
 
-   - CPUs: HIP support is available through the HIP-CPU runtime library `<https://github.com/ROCm-Developer-Tools/HIP-CPU>`_. This header-only library enables CPUs to execute unmodified HIP code.
+   - CPUs: HIP support is available through the HIP-CPU runtime library `<https://github.com/ROCm/HIP-CPU>`_. This header-only library enables CPUs to execute unmodified HIP code.
 
 
 Example Usage
@@ -328,7 +328,7 @@ The `parallel_unsequenced_policy <https://en.cppreference.com/w/cpp/algorithm/ex
 maps relatively well to the execution model of AMD GPUs. This, coupled with the
 availability and maturity of GPU accelerated algorithm libraries that
 implement most / all corresponding algorithms in the standard library
-(e.g. `rocThrust <https://github.com/ROCmSoftwarePlatform/rocThrust>`__), makes
+(e.g. `rocThrust <https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocthrust>`__), makes
 it feasible to provide seamless accelerator offload for supported algorithms,
 when an accelerated version exists. Thus, it becomes possible to easily access
 the computational resources of an AMD accelerator, via a well specified,
@@ -483,7 +483,7 @@ such as GPUs, work.
      allocation / deallocation functions with accelerator-aware equivalents,
      based on a pre-established table; the list of functions that can be
      interposed is available
-     `here <https://github.com/ROCmSoftwarePlatform/roc-stdpar#allocation--deallocation-interposition-status>`__;
+     `here <https://github.com/ROCm/roc-stdpar#allocation--deallocation-interposition-status>`__;
    - This is only run when compiling for the host.
 
 The second pass is optional.
@@ -627,7 +627,7 @@ Linux operating system. Support is synthesised in the following table:
 The minimum Linux kernel version for running in HMM mode is 6.4.
 
 The forwarding header can be obtained from
-`its GitHub repository <https://github.com/ROCmSoftwarePlatform/roc-stdpar>`_.
+`its GitHub repository <https://github.com/ROCm/roc-stdpar>`_.
 It will be packaged with a future `ROCm <https://rocm.docs.amd.com/en/latest/>`_
 release. Because accelerated algorithms are provided via
 `rocThrust <https://rocm.docs.amd.com/projects/rocThrust/en/latest/>`_, a
@@ -636,7 +636,7 @@ transitive dependency on
 can be obtained either by installing their associated components of the
 `ROCm <https://rocm.docs.amd.com/en/latest/>`_ stack, or from their respective
 repositories. The list of algorithms that can be offloaded is available
-`here <https://github.com/ROCmSoftwarePlatform/roc-stdpar#algorithm-support-status>`_.
+`here <https://github.com/ROCm/roc-stdpar#algorithm-support-status>`_.
 
 HIP Specific Elements
 ---------------------
 
@@ -311,7 +311,7 @@ Error relocateOffloadSection(const ArgList &Args, StringRef Output) {
   ObjcopyArgs.emplace_back("--remove-section");
   ObjcopyArgs.emplace_back(".llvm.offloading");
   StringRef Prefix = "llvm";
-  auto Section = (Prefix + "llvm_offload_entries").str();
+  auto Section = (Prefix + "_offload_entries").str();
   // Rename the offloading entries to make them private to this link unit.
   ObjcopyArgs.emplace_back("--rename-section");
   ObjcopyArgs.emplace_back(
 
@@ -265,8 +265,6 @@ int CollectDataFlow(const std::string &DFTBinary, const std::string &DirPath,
     // we then request tags in [0,Size/2) and [Size/2, Size), and so on.
     // Function number => DFT.
     auto OutPath = DirPlusFile(DirPath, Hash(FileToVector(F.File)));
-    std::unordered_map<size_t, std::vector<uint8_t>> DFTMap;
-    std::unordered_set<std::string> Cov;
     Command Cmd;
     Cmd.addArgument(DFTBinary);
     Cmd.addArgument(F.File);
 
@@ -54,6 +54,14 @@ void RTDEF(CUFSyncGlobalDescriptor)(
   ((Descriptor *)devAddr, (Descriptor *)hostPtr, sourceFile, sourceLine);
 }
 
+void RTDEF(CUFDescriptorCheckSection)(
+    const Descriptor *desc, const char *sourceFile, int sourceLine) {
+  if (desc && !desc->IsContiguous()) {
+    Terminator terminator{sourceFile, sourceLine};
+    terminator.Crash("device array section argument is not contiguous");
+  }
+}
+
 RT_EXT_API_GROUP_END
 }
 } // namespace Fortran::runtime::cuda
@@ -63,5 +63,8 @@ ENUM_LOWERINGOPT(StackRepackArrays, unsigned, 1, 0)
 /// in the leading dimension.
 ENUM_LOWERINGOPT(RepackArraysWhole, unsigned, 1, 0)
 
+/// If true, CUDA Fortran runtime checks are inserted.
+ENUM_LOWERINGOPT(CUDARuntimeCheck, unsigned, 1, 0)
+
 #undef LOWERINGOPT
 #undef ENUM_LOWERINGOPT
@@ -26,6 +26,11 @@ namespace fir::runtime::cuda {
 void genSyncGlobalDescriptor(fir::FirOpBuilder &builder, mlir::Location loc,
                              mlir::Value hostPtr);
 
+/// Generate runtime call to check the section of a descriptor and raise an
+/// error if it is not contiguous.
+void genDescriptorCheckSection(fir::FirOpBuilder &builder, mlir::Location loc,
+                               mlir::Value desc);
+
 } // namespace fir::runtime::cuda
 
 #endif // FORTRAN_OPTIMIZER_BUILDER_RUNTIME_CUDA_DESCRIPTOR_H_
@@ -37,6 +37,10 @@ void RTDECL(CUFDescriptorSync)(Descriptor *dst, const Descriptor *src,
 void RTDECL(CUFSyncGlobalDescriptor)(
     void *hostPtr, const char *sourceFile = nullptr, int sourceLine = 0);
 
+/// Crash if the descriptor is non-null and describes a non-contiguous section.
+void RTDECL(CUFDescriptorCheckSection)(
+    const Descriptor *, const char *sourceFile = nullptr, int sourceLine = 0);
+
 } // extern "C"
 
 } // namespace Fortran::runtime::cuda
 
@@ -26,6 +26,7 @@
 #include "flang/Optimizer/Builder/IntrinsicCall.h"
 #include "flang/Optimizer/Builder/LowLevelIntrinsics.h"
 #include "flang/Optimizer/Builder/MutableBox.h"
+#include "flang/Optimizer/Builder/Runtime/CUDA/Descriptor.h"
 #include "flang/Optimizer/Builder/Runtime/Derived.h"
 #include "flang/Optimizer/Builder/Todo.h"
 #include "flang/Optimizer/Dialect/CUF/CUFOps.h"
@@ -543,6 +544,19 @@ Fortran::lower::genCallOpAndResult(
   fir::FortranProcedureFlagsEnumAttr procAttrs =
       caller.getProcedureAttrs(builder.getContext());
 
+  if (converter.getLoweringOptions().getCUDARuntimeCheck()) {
+    if (caller.getCallDescription().chevrons().empty()) {
+      for (auto [oper, arg] :
+           llvm::zip(operands, caller.getPassedArguments())) {
+        if (auto boxTy = mlir::dyn_cast<fir::BaseBoxType>(oper.getType())) {
+          const Fortran::semantics::Symbol *sym = caller.getDummySymbol(arg);
+          if (sym && Fortran::evaluate::IsCUDADeviceSymbol(*sym))
+            fir::runtime::cuda::genDescriptorCheckSection(builder, loc, oper);
+        }
+      }
+    }
+  }
+
   if (!caller.getCallDescription().chevrons().empty()) {
     // A call to a CUDA kernel with the chevron syntax.
 
 
@@ -32,3 +32,18 @@ void fir::runtime::cuda::genSyncGlobalDescriptor(fir::FirOpBuilder &builder,
       builder, loc, fTy, hostPtr, sourceFile, sourceLine)};
   builder.create<fir::CallOp>(loc, callee, args);
 }
+
+void fir::runtime::cuda::genDescriptorCheckSection(fir::FirOpBuilder &builder,
+                                                   mlir::Location loc,
+                                                   mlir::Value desc) {
+  mlir::func::FuncOp func =
+      fir::runtime::getRuntimeFunc<mkRTKey(CUFDescriptorCheckSection)>(loc,
+                                                                       builder);
+  auto fTy = func.getFunctionType();
+  mlir::Value sourceFile = fir::factory::locationToFilename(builder, loc);
+  mlir::Value sourceLine =
+      fir::factory::locationToLineNo(builder, loc, fTy.getInput(2));
+  llvm::SmallVector<mlir::Value> args{fir::runtime::createArguments(
+      builder, loc, fTy, desc, sourceFile, sourceLine)};
+  builder.create<fir::CallOp>(loc, func, args);
+}
@@ -0,0 +1,22 @@
+! RUN: bbc -emit-hlfir -fcuda %s -o - | FileCheck %s
+
+! Check insertion of runtime checks
+
+interface
+  subroutine foo(a)
+    real, device, dimension(:,:) :: a
+  end subroutine
+end interface
+
+  real, device, allocatable, dimension(:,:) :: a
+  allocate(a(10,10))
+  call foo(a(1:10,1:10:2))
+end
+
+subroutine foo(a)
+  real, device, dimension(:,:) :: a
+end subroutine
+
+! CHECK-LABEL: func.func @_QQmain()
+! CHECK: fir.call @_FortranACUFDescriptorCheckSection
+! CHECK: fir.call @_QPfoo
Original file line number	Diff line number	Diff line change
`@@ -54,6 +54,14 @@ void RTDEF(CUFSyncGlobalDescriptor)(`
`54`	`54`	`((Descriptor *)devAddr, (Descriptor *)hostPtr, sourceFile, sourceLine);`
`55`	`55`	`}`
`56`	`56`
	`57`	`+void RTDEF(CUFDescriptorCheckSection)(`
	`58`	`+ const Descriptor *desc, const char *sourceFile, int sourceLine) {`
	`59`	`+ if (desc && !desc->IsContiguous()) {`
	`60`	`+ Terminator terminator{sourceFile, sourceLine};`
	`61`	`+ terminator.Crash("device array section argument is not contiguous");`
	`62`	`+ }`
	`63`	`+}`
	`64`	`+`
`57`	`65`	`RT_EXT_API_GROUP_END`
`58`	`66`	`}`
`59`	`67`	`} // namespace Fortran::runtime::cuda`