Commit a2899c4

[flang][cuda] Support data transfer with conversion (#153242)
When the rhs of the data transfer has a different type than the lhs, allocate a temporary on the host and transfer the rhs into it first. Then use the elemental op that was created to perform the conversion.
1 parent 6ae6c4f commit a2899c4
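
The pattern this commit handles is a host assignment whose right-hand side is a device array of a different kind, as exercised by the new sub26/sub27 tests added below. A minimal CUDA Fortran sketch (the subroutine name is illustrative; the declarations mirror sub27):

  subroutine demo()
    real(2), dimension(10, 20, 30), device :: d   ! device data, real(2) elements
    real(4), dimension(10, 20, 30) :: hd          ! host data, real(4) elements

    hd = d   ! device-to-host transfer that also needs an f16 -> f32 conversion
  end subroutine

With this change, lowering first allocates a host temporary with the device element type (f16), issues a cuf.data_transfer from the device array into it, and rewires the existing hlfir.elemental so its hlfir.designate reads from that temporary; the converted result is then assigned to the host lhs with hlfir.assign.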

File tree

4 files changed: +87 -2 lines

  flang/include/flang/Lower/CUDA.h
  flang/lib/Lower/Bridge.cpp
  flang/lib/Lower/CUDA.cpp
  flang/test/Lower/CUDA/cuda-data-transfer.cuf

flang/include/flang/Lower/CUDA.h

Lines changed: 2 additions & 0 deletions

@@ -62,6 +62,8 @@ cuf::DataAttributeAttr
 translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
                                 const Fortran::semantics::Symbol &sym);
 
+bool isTransferWithConversion(mlir::Value rhs);
+
 } // end namespace Fortran::lower
 
 #endif // FORTRAN_LOWER_CUDA_H

flang/lib/Lower/Bridge.cpp

Lines changed: 28 additions & 2 deletions

@@ -4827,7 +4827,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
   void genCUDADataTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
                            const Fortran::evaluate::Assignment &assign,
-                           hlfir::Entity &lhs, hlfir::Entity &rhs) {
+                           hlfir::Entity &lhs, hlfir::Entity &rhs,
+                           bool isWholeAllocatableAssignment,
+                           bool keepLhsLengthInAllocatableAssignment) {
     bool lhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs);
     bool rhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs);
 
@@ -4892,6 +4894,28 @@ class FirConverter : public Fortran::lower::AbstractConverter {
 
     // host = device
     if (!lhsIsDevice && rhsIsDevice) {
+      if (Fortran::lower::isTransferWithConversion(rhs)) {
+        mlir::OpBuilder::InsertionGuard insertionGuard(builder);
+        auto elementalOp =
+            mlir::dyn_cast<hlfir::ElementalOp>(rhs.getDefiningOp());
+        assert(elementalOp && "expect elemental op");
+        auto designateOp =
+            *elementalOp.getBody()->getOps<hlfir::DesignateOp>().begin();
+        builder.setInsertionPoint(elementalOp);
+        // Create a temp to transfer the rhs before applying the conversion.
+        hlfir::Entity entity{designateOp.getMemref()};
+        auto [temp, cleanup] = hlfir::createTempFromMold(loc, builder, entity);
+        auto transferKindAttr = cuf::DataTransferKindAttr::get(
+            builder.getContext(), cuf::DataTransferKind::DeviceHost);
+        cuf::DataTransferOp::create(builder, loc, designateOp.getMemref(), temp,
+                                    /*shape=*/mlir::Value{}, transferKindAttr);
+        designateOp.getMemrefMutable().assign(temp);
+        builder.setInsertionPointAfter(elementalOp);
+        hlfir::AssignOp::create(builder, loc, elementalOp, lhs,
+                                isWholeAllocatableAssignment,
+                                keepLhsLengthInAllocatableAssignment);
+        return;
+      }
       auto transferKindAttr = cuf::DataTransferKindAttr::get(
           builder.getContext(), cuf::DataTransferKind::DeviceHost);
       cuf::DataTransferOp::create(builder, loc, rhsVal, lhsVal, shape,
@@ -5039,7 +5063,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
     hlfir::Entity rhs = evaluateRhs(localStmtCtx);
     hlfir::Entity lhs = evaluateLhs(localStmtCtx);
     if (isCUDATransfer && !hasCUDAImplicitTransfer)
-      genCUDADataTransfer(builder, loc, assign, lhs, rhs);
+      genCUDADataTransfer(builder, loc, assign, lhs, rhs,
+                          isWholeAllocatableAssignment,
+                          keepLhsLengthInAllocatableAssignment);
     else
       hlfir::AssignOp::create(builder, loc, rhs, lhs,
                               isWholeAllocatableAssignment,

flang/lib/Lower/CUDA.cpp

Lines changed: 9 additions & 0 deletions

@@ -155,3 +155,12 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
       Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
   return cuf::getDataAttribute(mlirContext, cudaAttr);
 }
+
+bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
+  if (auto elOp = mlir::dyn_cast<hlfir::ElementalOp>(rhs.getDefiningOp()))
+    if (llvm::hasSingleElement(elOp.getBody()->getOps<hlfir::DesignateOp>()) &&
+        llvm::hasSingleElement(elOp.getBody()->getOps<fir::LoadOp>()) == 1 &&
+        llvm::hasSingleElement(elOp.getBody()->getOps<fir::ConvertOp>()) == 1)
+      return true;
+  return false;
+}

flang/test/Lower/CUDA/cuda-data-transfer.cuf

Lines changed: 48 additions & 0 deletions

@@ -474,3 +474,51 @@ end
 ! CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.box<!fir.array<?xf64>>
 ! CHECK: hlfir.assign %{{.*}} to %{{.*}} : f64, !fir.ref<f64>
 ! CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?xf64>>
+
+subroutine sub26(i, j, k)
+  integer :: i, j, k
+  real(2), dimension(i,j,k), device :: d
+  real(4), dimension(i,j,k) :: hd
+
+  hd = d
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub26
+! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<?x?x?xf16>, %{{.*}}, %{{.*}}, %{{.*}} : index, index, index {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub26Ed"} -> !fir.ref<!fir.array<?x?x?xf16>>
+! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub26Ed"} : (!fir.ref<!fir.array<?x?x?xf16>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf16>>, !fir.ref<!fir.array<?x?x?xf16>>)
+! CHECK: %[[HD:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub26Ehd"} : (!fir.ref<!fir.array<?x?x?xf32>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.ref<!fir.array<?x?x?xf32>>)
+! CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<?x?x?xf16>, %8, %13, %18 {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?x?xf16>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf16>>, !fir.heap<!fir.array<?x?x?xf16>>)
+! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.box<!fir.array<?x?x?xf16>>, !fir.box<!fir.array<?x?x?xf16>>
+! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<?x?x?xf32> {
+! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
+! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.box<!fir.array<?x?x?xf16>>, index, index, index) -> !fir.ref<f16>
+! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref<f16>
+! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32
+! CHECK: hlfir.yield_element %[[CONV]] : f32
+! CHECK: }
+! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<?x?x?xf32>, !fir.box<!fir.array<?x?x?xf32>>
+
+subroutine sub27()
+  real(2), dimension(10, 20, 30), device :: d
+  real(4), dimension(10, 20, 30) :: hd
+
+  hd = d
+end subroutine
+
+! CHECK-LABEL: func.func @_QPsub27()
+! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<10x20x30xf16> {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub27Ed"} -> !fir.ref<!fir.array<10x20x30xf16>>
+! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub27Ed"} : (!fir.ref<!fir.array<10x20x30xf16>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x20x30xf16>>, !fir.ref<!fir.array<10x20x30xf16>>)
+! CHECK: %[[ALLOC_HD:.*]] = fir.alloca !fir.array<10x20x30xf32> {bindc_name = "hd", uniq_name = "_QFsub27Ehd"}
+! CHECK: %[[HD:.*]]:2 = hlfir.declare %[[ALLOC_HD]](%{{.*}}) {uniq_name = "_QFsub27Ehd"} : (!fir.ref<!fir.array<10x20x30xf32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x20x30xf32>>, !fir.ref<!fir.array<10x20x30xf32>>)
+! CHECK: %[[ALLOC_TEMP:.*]] = fir.allocmem !fir.array<10x20x30xf16> {bindc_name = ".tmp", uniq_name = ""}
+! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC_TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10x20x30xf16>>, !fir.shape<3>) -> (!fir.heap<!fir.array<10x20x30xf16>>, !fir.heap<!fir.array<10x20x30xf16>>)
+! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<10x20x30xf16>>, !fir.heap<!fir.array<10x20x30xf16>>
+! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<10x20x30xf32> {
+! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
+! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.heap<!fir.array<10x20x30xf16>>, index, index, index) -> !fir.ref<f16>
+! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref<f16>
+! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32
+! CHECK: hlfir.yield_element %[[CONV]] : f32
+! CHECK: }
+! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<10x20x30xf32>, !fir.ref<!fir.array<10x20x30xf32>>
