Skip to content

Commit 36e13b2

Browse files
authored
Merge branch 'main' into openclcpp-friend-decl
2 parents 4336c58 + f494131 commit 36e13b2

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

47 files changed

+739
-76
lines changed

clang/lib/AST/ByteCode/Descriptor.cpp

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -473,9 +473,7 @@ bool Descriptor::hasTrivialDtor() const {
473473
bool Descriptor::isUnion() const { return isRecord() && ElemRecord->isUnion(); }
474474

475475
InitMap::InitMap(unsigned N)
476-
: UninitFields(N), Data(std::make_unique<T[]>(numFields(N))) {
477-
std::fill_n(data(), numFields(N), 0);
478-
}
476+
: UninitFields(N), Data(std::make_unique<T[]>(numFields(N))) {}
479477

480478
bool InitMap::initializeElement(unsigned I) {
481479
unsigned Bucket = I / PER_FIELD;

flang/include/flang/Lower/CUDA.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,8 @@ cuf::DataAttributeAttr
6262
translateSymbolCUFDataAttribute(mlir::MLIRContext *mlirContext,
6363
const Fortran::semantics::Symbol &sym);
6464

65+
bool isTransferWithConversion(mlir::Value rhs);
66+
6567
} // end namespace Fortran::lower
6668

6769
#endif // FORTRAN_LOWER_CUDA_H

flang/lib/Lower/Bridge.cpp

Lines changed: 28 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4827,7 +4827,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
48274827

48284828
void genCUDADataTransfer(fir::FirOpBuilder &builder, mlir::Location loc,
48294829
const Fortran::evaluate::Assignment &assign,
4830-
hlfir::Entity &lhs, hlfir::Entity &rhs) {
4830+
hlfir::Entity &lhs, hlfir::Entity &rhs,
4831+
bool isWholeAllocatableAssignment,
4832+
bool keepLhsLengthInAllocatableAssignment) {
48314833
bool lhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.lhs);
48324834
bool rhsIsDevice = Fortran::evaluate::HasCUDADeviceAttrs(assign.rhs);
48334835

@@ -4892,6 +4894,28 @@ class FirConverter : public Fortran::lower::AbstractConverter {
48924894

48934895
// host = device
48944896
if (!lhsIsDevice && rhsIsDevice) {
4897+
if (Fortran::lower::isTransferWithConversion(rhs)) {
4898+
mlir::OpBuilder::InsertionGuard insertionGuard(builder);
4899+
auto elementalOp =
4900+
mlir::dyn_cast<hlfir::ElementalOp>(rhs.getDefiningOp());
4901+
assert(elementalOp && "expect elemental op");
4902+
auto designateOp =
4903+
*elementalOp.getBody()->getOps<hlfir::DesignateOp>().begin();
4904+
builder.setInsertionPoint(elementalOp);
4905+
// Create a temp to transfer the rhs before applying the conversion.
4906+
hlfir::Entity entity{designateOp.getMemref()};
4907+
auto [temp, cleanup] = hlfir::createTempFromMold(loc, builder, entity);
4908+
auto transferKindAttr = cuf::DataTransferKindAttr::get(
4909+
builder.getContext(), cuf::DataTransferKind::DeviceHost);
4910+
cuf::DataTransferOp::create(builder, loc, designateOp.getMemref(), temp,
4911+
/*shape=*/mlir::Value{}, transferKindAttr);
4912+
designateOp.getMemrefMutable().assign(temp);
4913+
builder.setInsertionPointAfter(elementalOp);
4914+
hlfir::AssignOp::create(builder, loc, elementalOp, lhs,
4915+
isWholeAllocatableAssignment,
4916+
keepLhsLengthInAllocatableAssignment);
4917+
return;
4918+
}
48954919
auto transferKindAttr = cuf::DataTransferKindAttr::get(
48964920
builder.getContext(), cuf::DataTransferKind::DeviceHost);
48974921
cuf::DataTransferOp::create(builder, loc, rhsVal, lhsVal, shape,
@@ -5039,7 +5063,9 @@ class FirConverter : public Fortran::lower::AbstractConverter {
50395063
hlfir::Entity rhs = evaluateRhs(localStmtCtx);
50405064
hlfir::Entity lhs = evaluateLhs(localStmtCtx);
50415065
if (isCUDATransfer && !hasCUDAImplicitTransfer)
5042-
genCUDADataTransfer(builder, loc, assign, lhs, rhs);
5066+
genCUDADataTransfer(builder, loc, assign, lhs, rhs,
5067+
isWholeAllocatableAssignment,
5068+
keepLhsLengthInAllocatableAssignment);
50435069
else
50445070
hlfir::AssignOp::create(builder, loc, rhs, lhs,
50455071
isWholeAllocatableAssignment,

flang/lib/Lower/CUDA.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -155,3 +155,12 @@ cuf::DataAttributeAttr Fortran::lower::translateSymbolCUFDataAttribute(
155155
Fortran::semantics::GetCUDADataAttr(&sym.GetUltimate());
156156
return cuf::getDataAttribute(mlirContext, cudaAttr);
157157
}
158+
159+
/// Return true when \p rhs is produced by an hlfir.elemental whose body holds
/// exactly one hlfir.designate, exactly one fir.load and exactly one
/// fir.convert.
///
/// A value that has no defining operation (for example a block argument), or
/// whose defining operation is not an hlfir.elemental, yields false.
bool Fortran::lower::isTransferWithConversion(mlir::Value rhs) {
  // getDefiningOp<OpTy>() returns a null op both when rhs has no defining
  // operation and when that operation is of another kind, so a null pointer
  // is never handed to dyn_cast.
  auto elementalOp = rhs.getDefiningOp<hlfir::ElementalOp>();
  if (!elementalOp)
    return false;

  auto *body = elementalOp.getBody();
  return llvm::hasSingleElement(body->getOps<hlfir::DesignateOp>()) &&
         llvm::hasSingleElement(body->getOps<fir::LoadOp>()) &&
         llvm::hasSingleElement(body->getOps<fir::ConvertOp>());
}

flang/module/cudadevice.f90

Lines changed: 11 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -450,29 +450,29 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
450450
end function
451451
end interface
452452

453-
interface __double2int_rn
454-
attributes(device) integer function __double2int_rn(r) bind(c)
453+
interface __double2int_rd
454+
attributes(device) integer function __double2int_rd(r) bind(c, name='__nv_double2int_rd')
455455
!dir$ ignore_tkr (d) r
456456
double precision, value :: r
457457
end function
458458
end interface
459459

460-
interface __double2int_rz
461-
attributes(device) integer function __double2int_rz(r) bind(c)
460+
interface __double2int_rn
461+
attributes(device) integer function __double2int_rn(r) bind(c, name='__nv_double2int_rn')
462462
!dir$ ignore_tkr (d) r
463463
double precision, value :: r
464464
end function
465465
end interface
466466

467467
interface __double2int_ru
468-
attributes(device) integer function __double2int_ru(r) bind(c)
468+
attributes(device) integer function __double2int_ru(r) bind(c, name='__nv_double2int_ru')
469469
!dir$ ignore_tkr (d) r
470470
double precision, value :: r
471471
end function
472472
end interface
473473

474-
interface __double2int_rd
475-
attributes(device) integer function __double2int_rd(r) bind(c)
474+
interface __double2int_rz
475+
attributes(device) integer function __double2int_rz(r) bind(c, name='__nv_double2int_rz')
476476
!dir$ ignore_tkr (d) r
477477
double precision, value :: r
478478
end function
@@ -695,15 +695,15 @@ attributes(device) real(8) function sinpi(x) bind(c,name='__nv_sinpi')
695695
end function
696696
end interface
697697

698-
interface __dsqrt_ru
699-
attributes(device) double precision function __dsqrt_ru(x) bind(c)
698+
interface __dsqrt_rd
699+
attributes(device) double precision function __dsqrt_rd(x) bind(c, name='__nv_dsqrt_rd')
700700
!dir$ ignore_tkr (d) x
701701
double precision, value :: x
702702
end function
703703
end interface
704704

705-
interface __dsqrt_rd
706-
attributes(device) double precision function __dsqrt_rd(x) bind(c)
705+
interface __dsqrt_ru
706+
attributes(device) double precision function __dsqrt_ru(x) bind(c, name='__nv_dsqrt_ru')
707707
!dir$ ignore_tkr (d) x
708708
double precision, value :: x
709709
end function

flang/test/Lower/CUDA/cuda-data-transfer.cuf

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -474,3 +474,51 @@ end
474474
! CHECK: cuf.data_transfer %{{.*}} to %{{.*}} {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.box<!fir.heap<!fir.array<?xf64>>>>, !fir.box<!fir.array<?xf64>>
475475
! CHECK: hlfir.assign %{{.*}} to %{{.*}} : f64, !fir.ref<f64>
476476
! CHECK: fir.freemem %{{.*}} : !fir.heap<!fir.array<?xf64>>
477+
478+
subroutine sub26(i, j, k)
479+
integer :: i, j, k
480+
real(2), dimension(i,j,k), device :: d
481+
real(4), dimension(i,j,k) :: hd
482+
483+
hd = d
484+
end subroutine
485+
486+
! CHECK-LABEL: func.func @_QPsub26
487+
! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<?x?x?xf16>, %{{.*}}, %{{.*}}, %{{.*}} : index, index, index {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub26Ed"} -> !fir.ref<!fir.array<?x?x?xf16>>
488+
! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub26Ed"} : (!fir.ref<!fir.array<?x?x?xf16>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf16>>, !fir.ref<!fir.array<?x?x?xf16>>)
489+
! CHECK: %[[HD:.*]]:2 = hlfir.declare %{{.*}}(%{{.*}}) {uniq_name = "_QFsub26Ehd"} : (!fir.ref<!fir.array<?x?x?xf32>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf32>>, !fir.ref<!fir.array<?x?x?xf32>>)
490+
! CHECK: %[[ALLOC:.*]] = fir.allocmem !fir.array<?x?x?xf16>, %{{.*}}, %{{.*}}, %{{.*}} {bindc_name = ".tmp", uniq_name = ""}
491+
! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<?x?x?xf16>>, !fir.shape<3>) -> (!fir.box<!fir.array<?x?x?xf16>>, !fir.heap<!fir.array<?x?x?xf16>>)
492+
! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.box<!fir.array<?x?x?xf16>>, !fir.box<!fir.array<?x?x?xf16>>
493+
! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<?x?x?xf32> {
494+
! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
495+
! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.box<!fir.array<?x?x?xf16>>, index, index, index) -> !fir.ref<f16>
496+
! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref<f16>
497+
! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32
498+
! CHECK: hlfir.yield_element %[[CONV]] : f32
499+
! CHECK: }
500+
! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<?x?x?xf32>, !fir.box<!fir.array<?x?x?xf32>>
501+
502+
subroutine sub27()
503+
real(2), dimension(10, 20, 30), device :: d
504+
real(4), dimension(10, 20, 30) :: hd
505+
506+
hd = d
507+
end subroutine
508+
509+
! CHECK-LABEL: func.func @_QPsub27()
510+
! CHECK: %[[ALLOC_D:.*]] = cuf.alloc !fir.array<10x20x30xf16> {bindc_name = "d", data_attr = #cuf.cuda<device>, uniq_name = "_QFsub27Ed"} -> !fir.ref<!fir.array<10x20x30xf16>>
511+
! CHECK: %[[D:.*]]:2 = hlfir.declare %[[ALLOC_D]](%{{.*}}) {data_attr = #cuf.cuda<device>, uniq_name = "_QFsub27Ed"} : (!fir.ref<!fir.array<10x20x30xf16>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x20x30xf16>>, !fir.ref<!fir.array<10x20x30xf16>>)
512+
! CHECK: %[[ALLOC_HD:.*]] = fir.alloca !fir.array<10x20x30xf32> {bindc_name = "hd", uniq_name = "_QFsub27Ehd"}
513+
! CHECK: %[[HD:.*]]:2 = hlfir.declare %[[ALLOC_HD]](%{{.*}}) {uniq_name = "_QFsub27Ehd"} : (!fir.ref<!fir.array<10x20x30xf32>>, !fir.shape<3>) -> (!fir.ref<!fir.array<10x20x30xf32>>, !fir.ref<!fir.array<10x20x30xf32>>)
514+
! CHECK: %[[ALLOC_TEMP:.*]] = fir.allocmem !fir.array<10x20x30xf16> {bindc_name = ".tmp", uniq_name = ""}
515+
! CHECK: %[[TEMP:.*]]:2 = hlfir.declare %[[ALLOC_TEMP]](%{{.*}}) {uniq_name = ".tmp"} : (!fir.heap<!fir.array<10x20x30xf16>>, !fir.shape<3>) -> (!fir.heap<!fir.array<10x20x30xf16>>, !fir.heap<!fir.array<10x20x30xf16>>)
516+
! CHECK: cuf.data_transfer %[[D]]#0 to %[[TEMP]]#0 {transfer_kind = #cuf.cuda_transfer<device_host>} : !fir.ref<!fir.array<10x20x30xf16>>, !fir.heap<!fir.array<10x20x30xf16>>
517+
! CHECK: %[[ELE:.*]] = hlfir.elemental %{{.*}} unordered : (!fir.shape<3>) -> !hlfir.expr<10x20x30xf32> {
518+
! CHECK: ^bb0(%{{.*}}: index, %{{.*}}: index, %{{.*}}: index):
519+
! CHECK: %[[DESIGNATE:.*]] = hlfir.designate %[[TEMP]]#0 (%{{.*}}, %{{.*}}, %{{.*}}) : (!fir.heap<!fir.array<10x20x30xf16>>, index, index, index) -> !fir.ref<f16>
520+
! CHECK: %[[LOAD:.*]] = fir.load %[[DESIGNATE]] : !fir.ref<f16>
521+
! CHECK: %[[CONV:.*]] = fir.convert %[[LOAD]] : (f16) -> f32
522+
! CHECK: hlfir.yield_element %[[CONV]] : f32
523+
! CHECK: }
524+
! CHECK: hlfir.assign %[[ELE]] to %[[HD]]#0 : !hlfir.expr<10x20x30xf32>, !fir.ref<!fir.array<10x20x30xf32>>

flang/test/Lower/CUDA/cuda-device-proc.cuf

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,7 @@ attributes(global) subroutine devsub()
5858
res = __ffs(al)
5959
res = __brev(ai)
6060
resl = __brev(al)
61+
6162
res = __clz(ai)
6263
res = __clz(al)
6364
af = __cosf(af)
@@ -69,9 +70,15 @@ attributes(global) subroutine devsub()
6970
af = __double2float_rz(ad)
7071
af = __double2float_ru(ad)
7172
af = __double2float_rd(ad)
73+
ai = __double2int_rd(ad)
74+
ai = __double2int_rn(ad)
75+
ai = __double2int_ru(ad)
76+
ai = __double2int_rz(ad)
7277
ai = __mul24(ai, ai)
7378
ai = __umul24(ai, ai)
7479
af = __powf(af, af)
80+
ad = __dsqrt_rd(ad)
81+
ad = __dsqrt_ru(ad)
7582
end
7683

7784
! CHECK-LABEL: func.func @_QPdevsub() attributes {cuf.proc_attr = #cuf.cuda_proc<global>}
@@ -129,9 +136,15 @@ end
129136
! CHECK: %{{.*}} = fir.call @__nv_double2float_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32
130137
! CHECK: %{{.*}} = fir.call @__nv_double2float_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32
131138
! CHECK: %{{.*}} = fir.call @__nv_double2float_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f32
139+
! CHECK: %{{.*}} = fir.call @__nv_double2int_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
140+
! CHECK: %{{.*}} = fir.call @__nv_double2int_rn(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
141+
! CHECK: %{{.*}} = fir.call @__nv_double2int_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
142+
! CHECK: %{{.*}} = fir.call @__nv_double2int_rz(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> i32
132143
! CHECK: %{{.*}} = fir.call @__nv_mul24(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32) -> i32
133144
! CHECK: %{{.*}} = fir.call @__nv_umul24(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (i32, i32) -> i32
134145
! CHECK: %{{.*}} = fir.call @__nv_powf(%{{.*}}, %{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f32, f32) -> f32
146+
! CHECK: %{{.*}} = fir.call @__nv_dsqrt_rd(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
147+
! CHECK: %{{.*}} = fir.call @__nv_dsqrt_ru(%{{.*}}) proc_attrs<bind_c> fastmath<contract> : (f64) -> f64
135148

136149
subroutine host1()
137150
integer, device :: a(32)

libc/config/baremetal/aarch64/entrypoints.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -763,6 +763,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
763763
libc.src.math.bf16div
764764
libc.src.math.bf16divf
765765
libc.src.math.bf16divl
766+
libc.src.math.bf16fma
767+
libc.src.math.bf16fmaf
768+
libc.src.math.bf16fmal
766769
libc.src.math.bf16mul
767770
libc.src.math.bf16mulf
768771
libc.src.math.bf16mull
@@ -792,6 +795,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
792795
# math.h C++23 mixed bfloat16 and _Float128 entrypoints
793796
libc.src.math.bf16addf128
794797
libc.src.math.bf16divf128
798+
libc.src.math.bf16fmaf128
795799
libc.src.math.bf16mulf128
796800
libc.src.math.bf16subf128
797801
)

libc/config/baremetal/arm/entrypoints.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -766,6 +766,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
766766
libc.src.math.bf16div
767767
libc.src.math.bf16divf
768768
libc.src.math.bf16divl
769+
libc.src.math.bf16fma
770+
libc.src.math.bf16fmaf
771+
libc.src.math.bf16fmal
769772
libc.src.math.bf16mul
770773
libc.src.math.bf16mulf
771774
libc.src.math.bf16mull
@@ -795,6 +798,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
795798
# math.h C++23 mixed bfloat16 and _Float128 entrypoints
796799
libc.src.math.bf16addf128
797800
libc.src.math.bf16divf128
801+
libc.src.math.bf16fmaf128
798802
libc.src.math.bf16mulf128
799803
libc.src.math.bf16subf128
800804
)

libc/config/baremetal/riscv/entrypoints.txt

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -766,6 +766,9 @@ list(APPEND TARGET_LIBM_ENTRYPOINTS
766766
libc.src.math.bf16div
767767
libc.src.math.bf16divf
768768
libc.src.math.bf16divl
769+
libc.src.math.bf16fma
770+
libc.src.math.bf16fmaf
771+
libc.src.math.bf16fmal
769772
libc.src.math.bf16mul
770773
libc.src.math.bf16mulf
771774
libc.src.math.bf16mull
@@ -795,6 +798,7 @@ if(LIBC_TYPES_HAS_FLOAT128)
795798
# math.h C++23 mixed bfloat16 and _Float128 entrypoints
796799
libc.src.math.bf16addf128
797800
libc.src.math.bf16divf128
801+
libc.src.math.bf16fmaf128
798802
libc.src.math.bf16mulf128
799803
libc.src.math.bf16subf128
800804
)

0 commit comments

Comments
 (0)