Skip to content

Commit 399cb35

Browse files
clementval authored and kcloudy0717 committed
[flang][cuda] Add alignment to shared memory operation (llvm#170372)
Shared memory for TMA operations needs to be aligned to 16. Add the ability to set an alignment on the cuf.shared_memory operation.
1 parent f38db1d commit 399cb35

File tree

4 files changed

+21
-5
lines changed

4 files changed

+21
-5
lines changed

flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -350,15 +350,14 @@ def cuf_SharedMemoryOp
350350
let arguments = (ins TypeAttr:$in_type, OptionalAttr<StrAttr>:$uniq_name,
351351
OptionalAttr<StrAttr>:$bindc_name, Variadic<AnyIntegerType>:$typeparams,
352352
Variadic<AnyIntegerType>:$shape,
353-
Optional<AnyIntegerType>:$offset // offset in bytes from the shared memory
354-
// base address.
355-
);
353+
// offset in bytes from the shared memory base address.
354+
Optional<AnyIntegerType>:$offset, OptionalAttr<I64Attr>:$alignment);
356355

357356
let results = (outs fir_ReferenceType:$ptr);
358357

359358
let assemblyFormat = [{
360359
(`[` $offset^ `:` type($offset) `]`)? $in_type (`(` $typeparams^ `:` type($typeparams) `)`)?
361-
(`,` $shape^ `:` type($shape) )? attr-dict `->` qualified(type($ptr))
360+
(`,` $shape^ `:` type($shape) )? (`align` $alignment^ )? attr-dict `->` qualified(type($ptr))
362361
}];
363362

364363
let builders = [OpBuilder<(ins "mlir::Type":$inType,

flang/lib/Optimizer/Builder/CUDAIntrinsicCall.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
#include "flang/Evaluate/common.h"
1818
#include "flang/Optimizer/Builder/FIRBuilder.h"
1919
#include "flang/Optimizer/Builder/MutableBox.h"
20+
#include "flang/Optimizer/Dialect/CUF/CUFOps.h"
21+
#include "flang/Optimizer/HLFIR/HLFIROps.h"
2022
#include "mlir/Dialect/Index/IR/IndexOps.h"
2123
#include "mlir/Dialect/SCF/IR/SCF.h"
2224
#include "mlir/Dialect/Vector/IR/VectorOps.h"
@@ -1489,13 +1491,21 @@ void CUDAIntrinsicLibrary::genTMABulkG2S(
14891491
builder, loc, dst, src, barrier, fir::getBase(args[3]), {}, {});
14901492
}
14911493

1494+
static void setAlignment(mlir::Value ptr, unsigned alignment) {
1495+
if (auto declareOp = mlir::dyn_cast<hlfir::DeclareOp>(ptr.getDefiningOp()))
1496+
if (auto sharedOp = mlir::dyn_cast<cuf::SharedMemoryOp>(
1497+
declareOp.getMemref().getDefiningOp()))
1498+
sharedOp.setAlignment(alignment);
1499+
}
1500+
14921501
static void genTMABulkLoad(fir::FirOpBuilder &builder, mlir::Location loc,
14931502
mlir::Value barrier, mlir::Value src,
14941503
mlir::Value dst, mlir::Value nelem,
14951504
mlir::Value eleSize) {
14961505
mlir::Value size = mlir::arith::MulIOp::create(builder, loc, nelem, eleSize);
14971506
auto llvmPtrTy = mlir::LLVM::LLVMPointerType::get(builder.getContext());
14981507
barrier = builder.createConvert(loc, llvmPtrTy, barrier);
1508+
setAlignment(dst, 16);
14991509
dst = builder.createConvert(loc, llvmPtrTy, dst);
15001510
src = builder.createConvert(loc, llvmPtrTy, src);
15011511
mlir::NVVM::InlinePtxOp::create(

flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -333,7 +333,7 @@ void cuf::SharedMemoryOp::build(
333333
bindcName.empty() ? mlir::StringAttr{} : builder.getStringAttr(bindcName);
334334
build(builder, result, wrapAllocaResultType(inType),
335335
mlir::TypeAttr::get(inType), nameAttr, bindcAttr, typeparams, shape,
336-
/*offset=*/mlir::Value{});
336+
/*offset=*/mlir::Value{}, /*alignment=*/mlir::IntegerAttr{});
337337
result.addAttributes(attributes);
338338
}
339339

flang/test/Lower/CUDA/cuda-device-proc.cuf

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -538,6 +538,7 @@ end subroutine
538538
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c4
539539
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
540540
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
541+
! CHECK: cuf.shared_memory !fir.array<1024xcomplex<f32>> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_c4Etmp"} -> !fir.ref<!fir.array<1024xcomplex<f32>>>
541542
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
542543
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
543544
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -557,6 +558,7 @@ end subroutine
557558
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_c8
558559
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_c8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
559560
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_c8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
561+
! CHECK: cuf.shared_memory !fir.array<1024xcomplex<f64>> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_c8Etmp"} -> !fir.ref<!fir.array<1024xcomplex<f64>>>
560562
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
561563
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 16 : i32
562564
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -576,6 +578,7 @@ end subroutine
576578
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i4
577579
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
578580
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
581+
! CHECK: cuf.shared_memory !fir.array<1024xi32> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_i4Etmp"} -> !fir.ref<!fir.array<1024xi32>>
579582
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
580583
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
581584
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -595,6 +598,7 @@ end subroutine
595598
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_i8
596599
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_i8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
597600
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_i8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
601+
! CHECK: cuf.shared_memory !fir.array<1024xi64> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_i8Etmp"} -> !fir.ref<!fir.array<1024xi64>>
598602
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
599603
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
600604
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -614,6 +618,7 @@ end subroutine
614618
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r2
615619
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r2Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
616620
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r2Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
621+
! CHECK: cuf.shared_memory !fir.array<1024xf16> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_r2Etmp"} -> !fir.ref<!fir.array<1024xf16>>
617622
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
618623
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 2 : i32
619624
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -633,6 +638,7 @@ end subroutine
633638
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r4
634639
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r4Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
635640
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r4Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
641+
! CHECK: cuf.shared_memory !fir.array<1024xf32> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_r4Etmp"} -> !fir.ref<!fir.array<1024xf32>>
636642
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
637643
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 4 : i32
638644
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32
@@ -652,6 +658,7 @@ end subroutine
652658
! CHECK-LABEL: func.func @_QPtest_tma_bulk_load_r8
653659
! CHECK: %[[BARRIER:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<shared>, uniq_name = "_QFtest_tma_bulk_load_r8Ebarrier1"} : (!fir.ref<i64>) -> (!fir.ref<i64>, !fir.ref<i64>)
654660
! CHECK: %[[ELEM_COUNT:.*]]:2 = hlfir.declare %{{.*}} {data_attr = #cuf.cuda<device>, uniq_name = "_QFtest_tma_bulk_load_r8Eelem_count"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
661+
! CHECK: cuf.shared_memory !fir.array<1024xf64> align 16 {bindc_name = "tmp", uniq_name = "_QFtest_tma_bulk_load_r8Etmp"} -> !fir.ref<!fir.array<1024xf64>>
655662
! CHECK: %[[COUNT:.*]] = fir.load %[[ELEM_COUNT]]#0 : !fir.ref<i32>
656663
! CHECK: %[[ELEM_SIZE:.*]] = arith.constant 8 : i32
657664
! CHECK: %[[SIZE:.*]] = arith.muli %[[COUNT]], %[[ELEM_SIZE]] : i32

0 commit comments

Comments
 (0)