From 41d353131e154e45fc2e68f95995f2c193de6f8e Mon Sep 17 00:00:00 2001
From: Valentin Clement
Date: Thu, 13 Feb 2025 17:29:15 -0800
Subject: [PATCH] [flang][cuda] Lower clock64 to nvvm intrinsic

Add a `clock64` interface to the `cudadevice` module and register a
`clock64` entry in the intrinsic handler table. `genClock64` declares
`llvm.nvvm.read.ptx.sreg.clock64` as a function taking no arguments and
returning the call's result type, then emits a `fir.call` to it. The
lowering test checks for that `fir.call`.

---
 flang/include/flang/Optimizer/Builder/IntrinsicCall.h |  1 +
 flang/lib/Optimizer/Builder/IntrinsicCall.cpp         | 11 +++++++++++
 flang/module/cudadevice.f90                           |  5 +++++
 flang/test/Lower/CUDA/cuda-device-proc.cuf            |  5 +++++
 4 files changed, 22 insertions(+)

diff --git a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
index 47e8a77fa6aec..65732ce7f3224 100644
--- a/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
+++ b/flang/include/flang/Optimizer/Builder/IntrinsicCall.h
@@ -231,6 +231,7 @@ struct IntrinsicLibrary {
   void genCFProcPointer(llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCFunLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
   fir::ExtendedValue genCLoc(mlir::Type, llvm::ArrayRef<fir::ExtendedValue>);
+  mlir::Value genClock64(mlir::Type, llvm::ArrayRef<mlir::Value>);
   template <mlir::arith::CmpIPredicate pred>
   fir::ExtendedValue genCPtrCompare(mlir::Type,
                                     llvm::ArrayRef<fir::ExtendedValue>);
diff --git a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
index 3dc8d217ef38e..93744fa58ebc0 100644
--- a/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
+++ b/flang/lib/Optimizer/Builder/IntrinsicCall.cpp
@@ -209,6 +209,7 @@ static constexpr IntrinsicHandler handlers[]{
      &I::genChdir,
      {{{"name", asAddr}, {"status", asAddr, handleDynamicOptional}}},
      /*isElemental=*/false},
+    {"clock64", &I::genClock64, {}, /*isElemental=*/false},
     {"cmplx",
      &I::genCmplx,
      {{{"x", asValue}, {"y", asValue, handleDynamicOptional}}}},
@@ -3228,6 +3229,16 @@ IntrinsicLibrary::genChdir(std::optional<mlir::Type> resultType,
   return {};
 }
 
+// CLOCK64
+mlir::Value IntrinsicLibrary::genClock64(mlir::Type resultType,
+                                         llvm::ArrayRef<mlir::Value> args) {
+  constexpr llvm::StringLiteral funcName = "llvm.nvvm.read.ptx.sreg.clock64";
+  mlir::MLIRContext *context = builder.getContext();
+  mlir::FunctionType ftype = mlir::FunctionType::get(context, {}, {resultType});
+  auto funcOp = builder.createFunction(loc, funcName, ftype);
+  return builder.create<fir::CallOp>(loc, funcOp, args).getResult(0);
+}
+
 // CMPLX
 mlir::Value IntrinsicLibrary::genCmplx(mlir::Type resultType,
                                        llvm::ArrayRef<mlir::Value> args) {
diff --git a/flang/module/cudadevice.f90 b/flang/module/cudadevice.f90
index 45b9f2c838638..ed126a1253908 100644
--- a/flang/module/cudadevice.f90
+++ b/flang/module/cudadevice.f90
@@ -628,5 +628,10 @@ attributes(device) pure integer function atomicdeci(address, val)
   end interface
   public :: atomicdec
 
+  interface
+    attributes(device) integer(8) function clock64()
+    end function
+  end interface
+  public :: clock64
 
 end module
diff --git a/flang/test/Lower/CUDA/cuda-device-proc.cuf b/flang/test/Lower/CUDA/cuda-device-proc.cuf
index 17a6a1d965640..6a5524102c0ea 100644
--- a/flang/test/Lower/CUDA/cuda-device-proc.cuf
+++ b/flang/test/Lower/CUDA/cuda-device-proc.cuf
@@ -9,6 +9,7 @@ attributes(global) subroutine devsub()
   real(8) :: ad
   integer(4) :: ai
   integer(8) :: al
+  integer(8) :: time
 
   call syncthreads()
   call syncwarp(1)
@@ -43,6 +44,8 @@ attributes(global) subroutine devsub()
   ai = atomicor(ai, 1_4)
   ai = atomicinc(ai, 1_4)
   ai = atomicdec(ai, 1_4)
+
+  time = clock64()
 end
 
 ! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
@@ -79,6 +82,8 @@ end
 ! CHECK: %{{.*}} = llvm.atomicrmw uinc_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 ! CHECK: %{{.*}} = llvm.atomicrmw udec_wrap %{{.*}}, %{{.*}} seq_cst : !llvm.ptr, i32
 
+! CHECK: fir.call @llvm.nvvm.read.ptx.sreg.clock64()
+
 subroutine host1()
   integer, device :: a(32)
   integer, device :: ret