Skip to content

Commit ee05440

Browse files
authored
[flang][cuda] Carry over the cuf.proc_attr attribute to gpu.launch_func (#124325)
1 parent ae8b560 commit ee05440

File tree

2 files changed

+26
-0
lines changed

2 files changed

+26
-0
lines changed

flang/lib/Optimizer/Transforms/CUFOpConversion.cpp

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -810,6 +810,7 @@ struct CUFLaunchOpConversion
810810
rewriter.getContext(),
811811
op.getCallee().getLeafReference().getValue())});
812812
mlir::Value clusterDimX, clusterDimY, clusterDimZ;
813+
cuf::ProcAttributeAttr procAttr;
813814
if (auto funcOp = symTab.lookup<mlir::func::FuncOp>(
814815
op.getCallee().getLeafReference())) {
815816
if (auto clusterDimsAttr = funcOp->getAttrOfType<cuf::ClusterDimsAttr>(
@@ -821,6 +822,8 @@ struct CUFLaunchOpConversion
821822
clusterDimZ = rewriter.create<mlir::arith::ConstantIndexOp>(
822823
loc, clusterDimsAttr.getZ().getInt());
823824
}
825+
procAttr =
826+
funcOp->getAttrOfType<cuf::ProcAttributeAttr>(cuf::getProcAttrName());
824827
}
825828
llvm::SmallVector<mlir::Value> args;
826829
for (mlir::Value arg : op.getArgs()) {
@@ -855,6 +858,8 @@ struct CUFLaunchOpConversion
855858
gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
856859
gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
857860
}
861+
if (procAttr)
862+
gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr);
858863
rewriter.replaceOp(op, gpuLaunchOp);
859864
return mlir::success();
860865
}

flang/test/Fir/CUDA/cuda-launch.fir

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,24 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
104104
// CHECK: %[[DEVADDR:.*]] = fir.call @_FortranACUFGetDeviceAddress(%[[CONV_ADDR]], %{{.*}}, %{{.*}}) : (!fir.llvm_ptr<i8>, !fir.ref<i8>, i32) -> !fir.llvm_ptr<i8>
105105
// CHECK: %[[CONV_DEVADDR:.*]] = fir.convert %[[DEVADDR]] : (!fir.llvm_ptr<i8>) -> !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>
106106
// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} args(%[[CONV_DEVADDR]] : !fir.ref<!fir.box<!fir.ptr<!fir.array<?xf32>>>>)
107+
108+
// -----
109+
110+
module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
111+
gpu.module @cuda_device_mod {
112+
gpu.func @_QMdevptrPtest() kernel {
113+
gpu.return
114+
}
115+
}
116+
func.func @_QMdevptrPtest() attributes {cuf.proc_attr = #cuf.cuda_proc<grid_global>} {
117+
return
118+
}
119+
func.func @_QQmain() {
120+
%c1_i32 = arith.constant 1 : i32
121+
cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
122+
return
123+
}
124+
}
125+
126+
// CHECK-LABEL: func.func @_QQmain()
127+
// CHECK: gpu.launch_func @cuda_device_mod::@_QMdevptrPtest blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %{{.*}} {cuf.proc_attr = #cuf.cuda_proc<grid_global>}

0 commit comments

Comments
 (0)