
Commit 7314e1d

clementval authored and NoumanAmir657 committed
[flang][cuda] Add conversion pattern for cuf.kernel_launch op (llvm#114129)
1 parent 1c1d806 commit 7314e1d
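In effect, the new pattern lowers a CUDA Fortran kernel launch (cuf.kernel_launch) into the GPU dialect's gpu.launch_func, index-casting the chevron grid/block values and passing a zero dynamic shared memory size. A minimal before/after sketch, adapted from the test added below (value names such as %g_x and %c0_i32 are illustrative placeholders, not taken from the commit):

// Before: CUDA Fortran chevron launch (grid = 1x1x1, block = 1x1x1).
cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()

// After: rewritten to the GPU dialect launch op; grid/block values are
// index-cast and the dynamic shared memory size defaults to 0.
gpu.launch_func @cuda_device_mod::@_QPsub_device1
    blocks in (%g_x, %g_y, %g_z) threads in (%b_x, %b_y, %b_z)
    dynamic_shared_memory_size %c0_i32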

File tree: 2 files changed, +132 −2 lines

flang/lib/Optimizer/Transforms/CUFOpConversion.cpp

Lines changed: 68 additions & 2 deletions
@@ -15,6 +15,7 @@
 #include "flang/Optimizer/Dialect/FIROps.h"
 #include "flang/Optimizer/HLFIR/HLFIROps.h"
 #include "flang/Optimizer/Support/DataLayout.h"
+#include "flang/Optimizer/Transforms/CUFCommon.h"
 #include "flang/Runtime/CUDA/allocatable.h"
 #include "flang/Runtime/CUDA/common.h"
 #include "flang/Runtime/CUDA/descriptor.h"
@@ -620,6 +621,69 @@ struct CufDataTransferOpConversion
   const mlir::SymbolTable &symtab;
 };
 
+struct CUFLaunchOpConversion
+    : public mlir::OpRewritePattern<cuf::KernelLaunchOp> {
+public:
+  using OpRewritePattern::OpRewritePattern;
+
+  CUFLaunchOpConversion(mlir::MLIRContext *context,
+                        const mlir::SymbolTable &symTab)
+      : OpRewritePattern(context), symTab{symTab} {}
+
+  mlir::LogicalResult
+  matchAndRewrite(cuf::KernelLaunchOp op,
+                  mlir::PatternRewriter &rewriter) const override {
+    mlir::Location loc = op.getLoc();
+    auto idxTy = mlir::IndexType::get(op.getContext());
+    auto zero = rewriter.create<mlir::arith::ConstantOp>(
+        loc, rewriter.getIntegerType(32), rewriter.getI32IntegerAttr(0));
+    auto gridSizeX =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridX());
+    auto gridSizeY =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridY());
+    auto gridSizeZ =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getGridZ());
+    auto blockSizeX =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockX());
+    auto blockSizeY =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockY());
+    auto blockSizeZ =
+        rewriter.create<mlir::arith::IndexCastOp>(loc, idxTy, op.getBlockZ());
+    auto kernelName = mlir::SymbolRefAttr::get(
+        rewriter.getStringAttr(cudaDeviceModuleName),
+        {mlir::SymbolRefAttr::get(
+            rewriter.getContext(),
+            op.getCallee().getLeafReference().getValue())});
+    mlir::Value clusterDimX, clusterDimY, clusterDimZ;
+    if (auto funcOp = symTab.lookup<mlir::func::FuncOp>(
+            op.getCallee().getLeafReference())) {
+      if (auto clusterDimsAttr = funcOp->getAttrOfType<cuf::ClusterDimsAttr>(
+              cuf::getClusterDimsAttrName())) {
+        clusterDimX = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getX().getInt());
+        clusterDimY = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getY().getInt());
+        clusterDimZ = rewriter.create<mlir::arith::ConstantIndexOp>(
+            loc, clusterDimsAttr.getZ().getInt());
+      }
+    }
+    auto gpuLaunchOp = rewriter.create<mlir::gpu::LaunchFuncOp>(
+        loc, kernelName, mlir::gpu::KernelDim3{gridSizeX, gridSizeY, gridSizeZ},
+        mlir::gpu::KernelDim3{blockSizeX, blockSizeY, blockSizeZ}, zero,
+        op.getArgs());
+    if (clusterDimX && clusterDimY && clusterDimZ) {
+      gpuLaunchOp.getClusterSizeXMutable().assign(clusterDimX);
+      gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
+      gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
+    }
+    rewriter.replaceOp(op, gpuLaunchOp);
+    return mlir::success();
+  }
+
+private:
+  const mlir::SymbolTable &symTab;
+};
+
 class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
 public:
   void runOnOperation() override {
@@ -637,7 +701,8 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
     fir::support::getOrSetDataLayout(module, /*allowDefaultLayout=*/false);
     fir::LLVMTypeConverter typeConverter(module, /*applyTBAA=*/false,
                                          /*forceUnifiedTBAATree=*/false, *dl);
-    target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect>();
+    target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
+                           mlir::gpu::GPUDialect>();
     cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab,
                                             patterns);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
@@ -656,5 +721,6 @@ void cuf::populateCUFToFIRConversionPatterns(
   patterns.insert<CufAllocOpConversion>(patterns.getContext(), &dl, &converter);
   patterns.insert<CufAllocateOpConversion, CufDeallocateOpConversion,
                   CufFreeOpConversion>(patterns.getContext());
-  patterns.insert<CufDataTransferOpConversion>(patterns.getContext(), symtab);
+  patterns.insert<CufDataTransferOpConversion, CUFLaunchOpConversion>(
+      patterns.getContext(), symtab);
 }
New test file. Lines changed: 64 additions & 0 deletions
@@ -0,0 +1,64 @@
+// RUN: fir-opt --split-input-file --cuf-convert %s | FileCheck %s
+
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QPsub_device1() kernel {
+      cf.br ^bb1
+    ^bb1:  // pred: ^bb0
+      gpu.return
+    }
+    gpu.func @_QPsub_device2(%arg0: !fir.ref<f32>) kernel {
+      cf.br ^bb1(%arg0 : !fir.ref<f32>)
+    ^bb1(%0: !fir.ref<f32>):  // pred: ^bb0
+      %1 = fir.declare %0 {uniq_name = "_QFsub1Ei"} : (!fir.ref<f32>) -> !fir.ref<f32>
+      %cst = arith.constant 2.000000e+00 : f32
+      fir.store %cst to %1 : !fir.ref<f32>
+      gpu.return
+    }
+  }
+
+  func.func @_QQmain() attributes {fir.bindc_name = "main"} {
+    %0 = fir.alloca f32
+    // CHECK: %[[ALLOCA:.*]] = fir.alloca f32
+    %c1 = arith.constant 1 : index
+    %c11_i32 = arith.constant 11 : i32
+    %c6_i32 = arith.constant 6 : i32
+    %c1_i32 = arith.constant 1 : i32
+    // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device1 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}}
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device1<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>()
+
+    // CHECK: gpu.launch_func @cuda_device_mod::@_QPsub_device2 blocks in (%{{.*}}, %{{.*}}, %{{.*}}) threads in (%{{.*}}, %{{.*}}, %{{.*}}) dynamic_shared_memory_size %c0{{.*}} args(%[[ALLOCA]] : !fir.ref<f32>)
+    cuf.kernel_launch @cuda_device_mod::@_QPsub_device2<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32>>>(%0) : (!fir.ref<f32>)
+    return
+  }
+
+}
+
+
+// -----
+
+module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_entry<f80, dense<128> : vector<2xi64>>, #dlti.dl_entry<i128, dense<128> : vector<2xi64>>, #dlti.dl_entry<i64, dense<64> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr<272>, dense<64> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<271>, dense<32> : vector<4xi64>>, #dlti.dl_entry<!llvm.ptr<270>, dense<32> : vector<4xi64>>, #dlti.dl_entry<f128, dense<128> : vector<2xi64>>, #dlti.dl_entry<f64, dense<64> : vector<2xi64>>, #dlti.dl_entry<f16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i32, dense<32> : vector<2xi64>>, #dlti.dl_entry<i16, dense<16> : vector<2xi64>>, #dlti.dl_entry<i8, dense<8> : vector<2xi64>>, #dlti.dl_entry<i1, dense<8> : vector<2xi64>>, #dlti.dl_entry<!llvm.ptr, dense<64> : vector<4xi64>>, #dlti.dl_entry<"dlti.endianness", "little">, #dlti.dl_entry<"dlti.stack_alignment", 128 : i64>>} {
+  gpu.module @cuda_device_mod {
+    gpu.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>>) kernel {
+      gpu.return
+    }
+  }
+
+  func.func @_QMmod1Psub1(%arg0: !fir.ref<!fir.array<10xi32>> {cuf.data_attr = #cuf.cuda<device>, fir.bindc_name = "adev"}) attributes {cuf.cluster_dims = #cuf.cluster_dims<x = 2 : i64, y = 2 : i64, z = 1 : i64>, cuf.proc_attr = #cuf.cuda_proc<global>} {
+    return
+  }
+  func.func @_QMmod1Phost_sub() {
+    %c10 = arith.constant 10 : index
+    %0 = cuf.alloc !fir.array<10xi32> {bindc_name = "adev", data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} -> !fir.ref<!fir.array<10xi32>>
+    %1 = fir.shape %c10 : (index) -> !fir.shape<1>
+    %2:2 = hlfir.declare %0(%1) {data_attr = #cuf.cuda<device>, uniq_name = "_QMmod1Fhost_subEadev"} : (!fir.ref<!fir.array<10xi32>>, !fir.shape<1>) -> (!fir.ref<!fir.array<10xi32>>, !fir.ref<!fir.array<10xi32>>)
+    %c1_i32 = arith.constant 1 : i32
+    %c10_i32 = arith.constant 10 : i32
+    cuf.kernel_launch @_QMmod1Psub1<<<%c1_i32, %c1_i32, %c1_i32, %c10_i32, %c1_i32, %c1_i32>>>(%2#1) : (!fir.ref<!fir.array<10xi32>>)
+    return
+  }
+}
+
+// CHECK-LABEL: func.func @_QMmod1Phost_sub()
+// CHECK: gpu.launch_func @cuda_device_mod::@_QMmod1Psub1 clusters in (%c2{{.*}}, %c2{{.*}}, %c1{{.*}})

0 commit comments