4 changes: 3 additions & 1 deletion flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -200,7 +200,7 @@ def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,

   let arguments = (ins SymbolRefAttr:$callee, I32:$grid_x, I32:$grid_y,
                    I32:$grid_z, I32:$block_x, I32:$block_y, I32:$block_z,
-                   Optional<I32>:$bytes, Optional<AnyIntegerType>:$stream,
+                   Optional<I32>:$bytes, Optional<fir_ReferenceType>:$stream,
                    Variadic<AnyType>:$args, OptionalAttr<DictArrayAttr>:$arg_attrs,
                    OptionalAttr<DictArrayAttr>:$res_attrs);

@@ -237,6 +237,8 @@ def cuf_KernelLaunchOp : cuf_Op<"kernel_launch", [CallOpInterface,
       *this, getNbNoArgOperand(), getArgs().size() - 1);
     }
   }];
+
+  let hasVerifier = 1;
 }

 def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
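With this change, cuf.kernel_launch takes its optional stream operand as a !fir.ref<i64> reference instead of a plain integer value, and the op gains a verifier. A minimal sketch of the new operand form (the @kernel symbol and the constants are placeholders, not from this patch):

  %c1 = arith.constant 1 : i32
  %c0 = arith.constant 0 : i32
  %stream = fir.alloca i64 {bindc_name = "stream"}
  cuf.kernel_launch @kernel<<<%c1, %c1, %c1, %c1, %c1, %c1, %c0, %stream : !fir.ref<i64>>>>()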
2 changes: 1 addition & 1 deletion flang/lib/Lower/ConvertCall.cpp
@@ -589,7 +589,7 @@ Fortran::lower::genCallOpAndResult(

   mlir::Value stream; // stream is optional.
   if (caller.getCallDescription().chevrons().size() > 3)
-    stream = fir::getBase(converter.genExprValue(
+    stream = fir::getBase(converter.genExprAddr(
         caller.getCallDescription().chevrons()[3], stmtCtx));

   builder.create<cuf::KernelLaunchOp>(
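The switch from genExprValue to genExprAddr makes lowering pass the address of the stream variable rather than its loaded value, matching the operand type change above. For a hypothetical launch call k<<<1, 1, 0, stream>>>(), the emitted FIR changes roughly as follows (a sketch based on the test updates below; %1 stands for the hlfir.declare result of stream):

  // Before: the stream value was loaded and passed as an i64.
  %2 = fir.load %1#0 : !fir.ref<i64>
  cuf.kernel_launch @k<<<%c1, %c1, %c1, %c1, %c1, %c1, %c0, %2 : i64>>>()

  // After: the reference itself is passed.
  cuf.kernel_launch @k<<<%c1, %c1, %c1, %c1, %c1, %c1, %c0, %1#0 : !fir.ref<i64>>>>()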
23 changes: 19 additions & 4 deletions flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -139,6 +139,24 @@ llvm::LogicalResult cuf::DeallocateOp::verify() {
   return mlir::success();
 }

+//===----------------------------------------------------------------------===//
+// KernelLaunchop

Contributor commented: Nit: KernelLaunchOp to be consistent with other ops.

+//===----------------------------------------------------------------------===//
+
+template <typename OpTy>
+static llvm::LogicalResult checkStreamType(OpTy op) {
+  if (!op.getStream())
+    return mlir::success();
+  auto refTy = mlir::dyn_cast<fir::ReferenceType>(op.getStream().getType());
+  if (!refTy.getEleTy().isInteger(64))
+    return op.emitOpError("stream is expected to be a i64 reference");
+  return mlir::success();
+}
+
+llvm::LogicalResult cuf::KernelLaunchOp::verify() {
+  return checkStreamType(*this);
+}
+
 //===----------------------------------------------------------------------===//
 // KernelOp
 //===----------------------------------------------------------------------===//
@@ -324,10 +342,7 @@ void cuf::SharedMemoryOp::build(
 //===----------------------------------------------------------------------===//

 llvm::LogicalResult cuf::StreamCastOp::verify() {
-  auto refTy = mlir::dyn_cast<fir::ReferenceType>(getStream().getType());
-  if (!refTy.getEleTy().isInteger(64))
-    return emitOpError("stream is expected to be a i64 reference");
-  return mlir::success();
+  return checkStreamType(*this);
 }

 // Tablegen operators
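Because checkStreamType is now shared by cuf.kernel_launch and cuf.stream_cast, both ops reject a stream operand that is not a reference to i64. An illustrative failing case (not from this patch), assuming a mistakenly 32-bit stream variable:

  %wrong = fir.alloca i32
  // Fails verification: "stream is expected to be a i64 reference".
  %token = cuf.stream_cast %wrong : <i32>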
12 changes: 10 additions & 2 deletions flang/lib/Optimizer/Transforms/CUFOpConversion.cpp
@@ -879,8 +879,15 @@ struct CUFLaunchOpConversion
       gpuLaunchOp.getClusterSizeYMutable().assign(clusterDimY);
       gpuLaunchOp.getClusterSizeZMutable().assign(clusterDimZ);
     }
-    if (op.getStream())
-      gpuLaunchOp.getAsyncObjectMutable().assign(op.getStream());
+    if (op.getStream()) {
+      mlir::OpBuilder::InsertionGuard guard(rewriter);
+      rewriter.setInsertionPoint(gpuLaunchOp);
+      mlir::Value stream =
+          rewriter.create<cuf::StreamCastOp>(loc, op.getStream());
+      llvm::errs() << stream << "\n";

Contributor commented: Is this intended output?
Author replied: Thanks for catching this.

+      gpuLaunchOp.getAsyncDependenciesMutable().append(stream);
+      llvm::errs() << gpuLaunchOp << "\n";

Contributor commented: Same here.

+    }
     if (procAttr)
       gpuLaunchOp->setAttr(cuf::getProcAttrName(), procAttr);
     rewriter.replaceOp(op, gpuLaunchOp);
@@ -933,6 +940,7 @@ class CUFOpConversion : public fir::impl::CUFOpConversionBase<CUFOpConversion> {
                                 /*forceUnifiedTBAATree=*/false, *dl);
     target.addLegalDialect<fir::FIROpsDialect, mlir::arith::ArithDialect,
                            mlir::gpu::GPUDialect>();
+    target.addLegalOp<cuf::StreamCastOp>();
     cuf::populateCUFToFIRConversionPatterns(typeConverter, *dl, symtab,
                                             patterns);
     if (mlir::failed(mlir::applyPartialConversion(getOperation(), target,
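The conversion no longer assigns the stream to the launch's async object. Instead it inserts a cuf.stream_cast right before the gpu.launch_func and appends the resulting token to the launch's async dependencies. Since that cast survives this pass, it must be declared legal, otherwise applyPartialConversion would flag it as an unconverted op. The rewritten IR looks roughly like this (a sketch following the updated test below; %decl and %c1 are placeholders):

  %token = cuf.stream_cast %decl#0 : <i64>
  gpu.launch_func [%token] @cuda_device_mod::@_QMdevptrPtest blocks in (%c1, %c1, %c1) threads in (%c1, %c1, %c1)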
7 changes: 3 additions & 4 deletions flang/test/Fir/CUDA/cuda-launch.fir
@@ -146,14 +146,13 @@ module attributes {gpu.container_module, dlti.dl_spec = #dlti.dl_spec<#dlti.dl_e
     %1:2 = hlfir.declare %0 {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
     %c1_i32 = arith.constant 1 : i32
     %c0_i32 = arith.constant 0 : i32
-    %2 = fir.load %1#0 : !fir.ref<i64>
-    cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c0_i32, %2 : i64>>>()
+    cuf.kernel_launch @_QMdevptrPtest<<<%c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c1_i32, %c0_i32, %1#0 : !fir.ref<i64>>>>()
     return
   }
 }

 // CHECK-LABEL: func.func @_QQmain()
 // CHECK: %[[STREAM:.*]] = fir.alloca i64 {bindc_name = "stream", uniq_name = "_QMtest_callFhostEstream"}
 // CHECK: %[[DECL_STREAM:.*]]:2 = hlfir.declare %[[STREAM]] {uniq_name = "_QMtest_callFhostEstream"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
-// CHECK: %[[STREAM_LOADED:.*]] = fir.load %[[DECL_STREAM]]#0 : !fir.ref<i64>
-// CHECK: gpu.launch_func <%[[STREAM_LOADED]] : i64> @cuda_device_mod::@_QMdevptrPtest
+// CHECK: %[[TOKEN:.*]] = cuf.stream_cast %[[DECL_STREAM]]#0 : <i64>
+// CHECK: gpu.launch_func [%[[TOKEN]]] @cuda_device_mod::@_QMdevptrPtest
6 changes: 3 additions & 3 deletions flang/test/Lower/CUDA/cuda-kernel-calls.cuf
@@ -45,8 +45,8 @@ contains
   call dev_kernel0<<<10, 20, 2>>>()
 ! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}>>>()

-  call dev_kernel0<<<10, 20, 2, 0>>>()
-! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %c0{{.*}}>>>()
+  call dev_kernel0<<<10, 20, 2, 0_8>>>()
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel0<<<%c10{{.*}}, %c1{{.*}}, %c1{{.*}}, %c20{{.*}}, %c1{{.*}}, %c1{{.*}}, %c2{{.*}}, %{{.*}} : !fir.ref<i64>>>>()

   call dev_kernel1<<<1, 32>>>(a)
 ! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%{{.*}}) : (!fir.ref<f32>)
@@ -55,7 +55,7 @@ contains
 ! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}>>>(%{{.*}})

   call dev_kernel1<<<*,32,0,stream>>>(a)
-! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}, %c0{{.*}}, %{{.*}} : i64>>>(%{{.*}}) : (!fir.ref<f32>)
+! CHECK: cuf.kernel_launch @_QMtest_callPdev_kernel1<<<%c-1{{.*}}, %c1{{.*}}, %c1{{.*}}, %c32{{.*}}, %c1{{.*}}, %c1{{.*}}, %c0{{.*}}, %{{.*}} : !fir.ref<i64>>>>(%{{.*}}) : (!fir.ref<f32>)

 end