Skip to content

Commit adc0a2c

Browse files
[mlir][acc] Introduce acc data bounds accessors (#156545)
Add acc.get_lowerbound, acc.get_upperbound, acc.get_stride, and acc.get_extent operations to extract information from acc bounds. This simplifies the arguments needed for recipes when handling slices and makes bound information consistent with data clauses. Update recipe documentation to clarify argument ordering and add examples demonstrating slice handling with bounds arguments.
1 parent 4d72bb3 commit adc0a2c

File tree

2 files changed

+263
-46
lines changed

2 files changed

+263
-46
lines changed

mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td

Lines changed: 203 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -505,6 +505,84 @@ def OpenACC_DataBoundsOp : OpenACC_Op<"bounds",
505505
];
506506
}
507507

508+
//===----------------------------------------------------------------------===//
509+
// DataBounds accessor operations
510+
//===----------------------------------------------------------------------===//
511+
512+
def OpenACC_GetLowerboundOp : OpenACC_Op<"get_lowerbound", [NoMemoryEffect]> {
513+
let summary = "Extract lowerbound from OpenACC data bounds.";
514+
let description = [{
515+
This operation extracts the lowerbound value from an `acc.bounds` value.
516+
If the data bounds does not have a lowerbound specified, it defaults to zero.
517+
518+
Example:
519+
```mlir
520+
%lb = acc.get_lowerbound %bounds : (!acc.data_bounds_ty) -> index
521+
```
522+
}];
523+
524+
let arguments = (ins OpenACC_DataBoundsType:$bounds);
525+
let results = (outs Index:$result);
526+
527+
let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
528+
}
529+
530+
def OpenACC_GetUpperboundOp : OpenACC_Op<"get_upperbound", [NoMemoryEffect]> {
531+
let summary = "Extract upperbound from OpenACC data bounds.";
532+
let description = [{
533+
This operation extracts the upperbound value from an `acc.bounds` value.
534+
If the data bounds does not have an upperbound specified, this operation
535+
uses the extent to compute it.
536+
537+
Example:
538+
```mlir
539+
%ub = acc.get_upperbound %bounds : (!acc.data_bounds_ty) -> index
540+
```
541+
}];
542+
543+
let arguments = (ins OpenACC_DataBoundsType:$bounds);
544+
let results = (outs Index:$result);
545+
546+
let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
547+
}
548+
549+
def OpenACC_GetStrideOp : OpenACC_Op<"get_stride", [NoMemoryEffect]> {
550+
let summary = "Extract stride from OpenACC data bounds.";
551+
let description = [{
552+
This operation extracts the stride value from an `acc.bounds` value.
553+
If the data bounds does not have a stride specified, it defaults to 1.
554+
555+
Example:
556+
```mlir
557+
%stride = acc.get_stride %bounds : (!acc.data_bounds_ty) -> index
558+
```
559+
}];
560+
561+
let arguments = (ins OpenACC_DataBoundsType:$bounds);
562+
let results = (outs Index:$result);
563+
564+
let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
565+
}
566+
567+
def OpenACC_GetExtentOp : OpenACC_Op<"get_extent", [NoMemoryEffect]> {
568+
let summary = "Extract extent from OpenACC data bounds.";
569+
let description = [{
570+
This operation extracts the extent value from an `acc.bounds` value.
571+
If the data bounds does not have an extent specified, it is computed
572+
from the upperbound.
573+
574+
Example:
575+
```mlir
576+
%extent = acc.get_extent %bounds : (!acc.data_bounds_ty) -> index
577+
```
578+
}];
579+
580+
let arguments = (ins OpenACC_DataBoundsType:$bounds);
581+
let results = (outs Index:$result);
582+
583+
let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
584+
}
585+
508586
// Data entry operation does not refer to OpenACC spec terminology, but to
509587
// terminology used in this dialect. It refers to data operations that will
510588
// appear before data or compute region. It will be used as the base of acc
@@ -1180,30 +1258,48 @@ def OpenACC_PrivateRecipeOp
11801258
1. The initializer region specifies how to allocate and initialize a new
11811259
private value. For example in Fortran, a derived-type might have a
11821260
default initialization. The region has an argument that contains the
1183-
value that need to be privatized. This is useful if the type is not
1184-
known at compile time and the private value is needed to create its
1185-
copy.
1261+
original value that needs to be privatized, followed by bounds arguments
1262+
(if any) in order from innermost to outermost dimension. The region
1263+
must yield the privatized copy.
11861264
2. The destroy region specifies how to destruct the value when it reaches
1187-
its end of life. It takes the privatized value as argument.
1265+
its end of life. It takes the original value, the privatized value, and
1266+
bounds arguments (if any) in the same order as the init region.
11881267

11891268
A single privatization recipe can be used for multiple operands if they have
11901269
the same type and do not require a specific default initialization.
11911270

11921271
Example:
11931272

11941273
```mlir
1195-
acc.private.recipe @privatization_f32 : f32 init {
1196-
^bb0(%0: f32):
1274+
acc.private.recipe @privatization_memref : memref<10x20xf32> init {
1275+
^bb0(%original: memref<10x20xf32>):
11971276
// init region contains a sequence of operations to create and
1198-
// initialize the copy if needed. It yields the create copy.
1277+
// initialize the copy. It yields the privatized copy.
1278+
%alloca = memref.alloca() : memref<10x20xf32>
1279+
acc.yield %alloca : memref<10x20xf32>
1280+
} destroy {
1281+
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
1282+
// destroy region is empty since alloca is automatically cleaned up
1283+
acc.terminator
1284+
}
1285+
1286+
// Example with bounds for array slicing:
1287+
acc.private.recipe @privatization_slice : memref<10x20xf32> init {
1288+
^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1289+
// Extract bounds and create appropriately sized allocation
1290+
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1291+
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1292+
%slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1293+
// ... base pointer adjustment logic ...
1294+
acc.yield %result : memref<10x20xf32>
11991295
} destroy {
1200-
^bb0(%0: f32)
1201-
// destroy region contains a sequences of operations to destruct the
1202-
// created copy.
1296+
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1297+
// Cleanup is automatic for alloca-based allocations
1298+
acc.terminator
12031299
}
12041300

12051301
// The privatization symbol is then used in the corresponding operation.
1206-
acc.parallel private(@privatization_f32 -> %a : f32) {
1302+
acc.parallel private(@privatization_memref -> %a : memref<10x20xf32>) {
12071303
}
12081304
```
12091305
}];
@@ -1239,38 +1335,64 @@ def OpenACC_FirstprivateRecipeOp
12391335
1. The initializer region specifies how to allocate and initialize a new
12401336
private value. For example in Fortran, a derived-type might have a
12411337
default initialization. The region has an argument that contains the
1242-
value that need to be privatized. This is useful if the type is not
1243-
known at compile time and the private value is needed to create its
1244-
copy.
1338+
original value that needs to be privatized, followed by bounds arguments
1339+
(if any) in order from innermost to outermost dimension. The region must
1340+
yield the privatized copy.
12451341
2. The copy region specifies how to copy the initial value to the newly
1246-
created private value. It takes the initial value and the privatized
1247-
value as arguments.
1342+
created private value. It takes the original value, the privatized
1343+
value, followed by bounds arguments (if any) in the same order as the init region.
12481344
3. The destroy region specifies how to destruct the value when it reaches
1249-
its end of life. It takes the privatized value as argument. It is
1250-
optional.
1345+
its end of life. It takes the original value, the privatized value, and
1346+
bounds arguments (if any) in the same order as the init region. It is optional.
12511347

12521348
A single privatization recipe can be used for multiple operands if they have
12531349
the same type and do not require a specific default initialization.
12541350

12551351
Example:
12561352

12571353
```mlir
1258-
acc.firstprivate.recipe @privatization_f32 : f32 init {
1259-
^bb0(%0: f32):
1354+
acc.firstprivate.recipe @firstprivate_memref : memref<10x20xf32> init {
1355+
^bb0(%original: memref<10x20xf32>):
12601356
// init region contains a sequence of operations to create and
1261-
// initialize the copy if needed. It yields the create copy.
1357+
// initialize the copy. It yields the privatized copy.
1358+
%alloca = memref.alloca() : memref<10x20xf32>
1359+
acc.yield %alloca : memref<10x20xf32>
12621360
} copy {
1263-
^bb0(%0: f32, %1: !llvm.ptr):
1361+
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
12641362
// copy region contains a sequence of operations to copy the initial value
12651363
// of the firstprivate value to the newly created value.
1364+
memref.copy %original, %privatized : memref<10x20xf32> to memref<10x20xf32>
1365+
acc.terminator
12661366
} destroy {
1267-
^bb0(%0: f32)
1268-
// destroy region contains a sequences of operations to destruct the
1269-
// created copy.
1367+
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
1368+
// destroy region is empty since alloca is automatically cleaned up
1369+
acc.terminator
1370+
}
1371+
1372+
// Example with bounds for array slicing:
1373+
acc.firstprivate.recipe @firstprivate_slice : memref<10x20xf32> init {
1374+
^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1375+
// Extract bounds and create appropriately sized allocation
1376+
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1377+
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1378+
%slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1379+
// ... base pointer adjustment logic ...
1380+
acc.yield %result : memref<10x20xf32>
1381+
} copy {
1382+
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1383+
// Copy the slice portion from original to privatized
1384+
%lb_inner = acc.get_lowerbound %bounds_inner : (!acc.data_bounds_ty) -> index
1385+
%lb_outer = acc.get_lowerbound %bounds_outer : (!acc.data_bounds_ty) -> index
1386+
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1387+
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1388+
%subview = memref.subview %original[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1389+
: memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1390+
// Copy subview to privatized...
1391+
acc.terminator
12701392
}
12711393

12721394
// The privatization symbol is then used in the corresponding operation.
1273-
acc.parallel firstprivate(@privatization_f32 -> %a : f32) {
1395+
acc.parallel firstprivate(@firstprivate_memref -> %a : memref<10x20xf32>) {
12741396
}
12751397
```
12761398
}];
@@ -1305,40 +1427,75 @@ def OpenACC_ReductionRecipeOp
13051427
mandatory regions and one optional region.
13061428

13071429
1. The initializer region specifies how to initialize the local reduction
1308-
value. The region has a first argument that contains the value of the
1309-
reduction accumulator at the start of the reduction. It is expected to
1310-
`acc.yield` the new value. Extra arguments can be added to deal with
1311-
dynamic arrays.
1312-
2. The reduction region contains a sequences of operations to combine two
1313-
values of the reduction type into one. It has at least two arguments
1314-
and it is expected to `acc.yield` the combined value. Extra arguments
1315-
can be added to deal with dynamic arrays.
1430+
value. The region has a first argument that contains the original value
1431+
that needs to be reduced, followed by bounds arguments (if any) in order
1432+
from innermost to outermost dimension. It is expected to `acc.yield` the
1433+
initialized reduction value.
1434+
2. The combiner region contains a sequence of operations to combine two
1435+
values of the reduction type into one. It takes the first reduction value,
1436+
the second reduction value, followed by bounds arguments (if any) in the
1437+
same order as the init region. It is expected to `acc.yield` the combined value.
13161438
3. The optional destroy region specifies how to destruct the value when it
1317-
reaches its end of life. It takes the reduction value as argument.
1439+
reaches its end of life. It takes the original value, the reduction value,
1440+
and bounds arguments (if any) in the same order as the init region.
13181441

13191442
Example:
13201443

13211444
```mlir
1322-
acc.reduction.recipe @reduction_add_i64 : i64 reduction_operator<add> init {
1323-
^bb0(%0: i64):
1445+
acc.reduction.recipe @reduction_add_memref : memref<10x20xf32> reduction_operator<add> init {
1446+
^bb0(%original: memref<10x20xf32>):
13241447
// init region contains a sequence of operations to initialize the local
13251448
// reduction value as specified in 2.5.15
1326-
%c0 = arith.constant 0 : i64
1327-
acc.yield %c0 : i64
1449+
%alloca = memref.alloca() : memref<10x20xf32>
1450+
%cst = arith.constant 0.0 : f32
1451+
linalg.fill ins(%cst : f32) outs(%alloca : memref<10x20xf32>)
1452+
acc.yield %alloca : memref<10x20xf32>
13281453
} combiner {
1329-
^bb0(%0: i64, %1: i64)
1454+
^bb0(%lhs: memref<10x20xf32>, %rhs: memref<10x20xf32>):
13301455
// combiner region contains a sequence of operations to combine
13311456
// two values into one.
1332-
%2 = arith.addi %0, %1 : i64
1333-
acc.yield %2 : i64
1457+
linalg.add ins(%lhs, %rhs : memref<10x20xf32>, memref<10x20xf32>)
1458+
outs(%lhs : memref<10x20xf32>)
1459+
acc.yield %lhs : memref<10x20xf32>
13341460
} destroy {
1335-
^bb0(%0: i64)
1336-
// destroy region contains a sequence of operations to destruct the
1337-
// created copy.
1461+
^bb0(%original: memref<10x20xf32>, %reduction: memref<10x20xf32>):
1462+
// destroy region is empty since alloca is automatically cleaned up
1463+
acc.terminator
1464+
}
1465+
1466+
// Example with bounds for array slicing:
1467+
acc.reduction.recipe @reduction_add_slice : memref<10x20xf32> reduction_operator<add> init {
1468+
^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1469+
// Extract bounds and create appropriately sized allocation
1470+
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1471+
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1472+
%slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1473+
%cst = arith.constant 0.0 : f32
1474+
linalg.fill ins(%cst : f32) outs(%slice_alloc : memref<?x?xf32>)
1475+
// ... base pointer adjustment logic ...
1476+
acc.yield %result : memref<10x20xf32>
1477+
} combiner {
1478+
^bb0(%lhs: memref<10x20xf32>, %rhs: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1479+
// Extract bounds to operate only on the slice portion
1480+
%lb_inner = acc.get_lowerbound %bounds_inner : (!acc.data_bounds_ty) -> index
1481+
%lb_outer = acc.get_lowerbound %bounds_outer : (!acc.data_bounds_ty) -> index
1482+
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1483+
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1484+
1485+
// Create subviews to access only the slice portions
1486+
%lhs_slice = memref.subview %lhs[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1487+
: memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1488+
%rhs_slice = memref.subview %rhs[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1489+
: memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1490+
1491+
// Combine only the slice portions
1492+
linalg.add ins(%lhs_slice, %rhs_slice : memref<?x?xf32, strided<[20, 1], offset: ?>>, memref<?x?xf32, strided<[20, 1], offset: ?>>)
1493+
outs(%lhs_slice : memref<?x?xf32, strided<[20, 1], offset: ?>>)
1494+
acc.yield %lhs : memref<10x20xf32>
13381495
}
13391496

13401497
// The reduction symbol is then used in the corresponding operation.
1341-
acc.parallel reduction(@reduction_add_i64 -> %a : i64) {
1498+
acc.parallel reduction(@reduction_add_memref -> %a : memref<10x20xf32>) {
13421499
}
13431500
```
13441501

mlir/test/Dialect/OpenACC/ops.mlir

Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2068,3 +2068,63 @@ func.func @acc_loop_container() {
20682068
// CHECK: acc.loop
20692069
// CHECK: scf.for
20702070
// CHECK: scf.for
2071+
2072+
// -----
2073+
2074+
// Test private recipe with data bounds for array slicing
2075+
acc.private.recipe @privatization_memref_slice : memref<10x10xf32> init {
2076+
^bb0(%arg0: memref<10x10xf32>, %bounds0: !acc.data_bounds_ty, %bounds1: !acc.data_bounds_ty):
2077+
// NOTE: OpenACC bounds are ordered from inner-most to outer-most dimension (rank 0 = inner-most)
2078+
// MLIR memref<10x10xf32> has first dimension as outer (10) and second as inner (10)
2079+
// So bounds0 corresponds to memref's second dimension (inner), bounds1 to first dimension (outer)
2080+
2081+
// Extract bounds information for the slice
2082+
// bounds0 = inner dimension (memref dimension 1)
2083+
%lb0 = acc.get_lowerbound %bounds0 : (!acc.data_bounds_ty) -> index
2084+
%extent0 = acc.get_extent %bounds0 : (!acc.data_bounds_ty) -> index
2085+
%stride0 = acc.get_stride %bounds0 : (!acc.data_bounds_ty) -> index
2086+
2087+
// bounds1 = outer dimension (memref dimension 0)
2088+
%lb1 = acc.get_lowerbound %bounds1 : (!acc.data_bounds_ty) -> index
2089+
%extent1 = acc.get_extent %bounds1 : (!acc.data_bounds_ty) -> index
2090+
%stride1 = acc.get_stride %bounds1 : (!acc.data_bounds_ty) -> index
2091+
2092+
// Allocate memory for only the slice dimensions on the stack
2093+
// Note: memref dimensions are outer-first, so extent1 (outer) comes first, extent0 (inner) second
2094+
%slice_alloc = memref.alloca(%extent1, %extent0) : memref<?x?xf32>
2095+
2096+
// Adjust base pointer to account for the slice offset
2097+
// We need to create a view that makes the slice appear as if it starts at the original indices
2098+
%c0 = arith.constant 0 : index
2099+
%c10 = arith.constant 10 : index
2100+
%c1 = arith.constant 1 : index
2101+
2102+
// Calculate linear offset: -(lb1 * stride1 + lb0 * stride0)
2103+
// For memref<10x10xf32>, stride1=10, stride0=1
2104+
%lb1_scaled = arith.muli %lb1, %c10 : index // lb1 * 10
2105+
%lb0_scaled = arith.muli %lb0, %c1 : index // lb0 * 1
2106+
%total_offset = arith.addi %lb1_scaled, %lb0_scaled : index // lb1*10 + lb0*1
2107+
%neg_offset = arith.subi %c0, %total_offset : index // -(lb1*10 + lb0*1)
2108+
2109+
// Create a view that adjusts for the lowerbound offset
2110+
// This makes accesses like result[lb1][lb0] map to slice_alloc[0][0]
2111+
//
2112+
// Example for slice a[2:4, 3:5] where:
2113+
// - bounds0 (inner): lb0=3, extent0=2
2114+
// - bounds1 (outer): lb1=2, extent1=2
2115+
// - Allocated memory: 2x2 array (extent1 x extent0 = 2 rows x 2 cols)
2116+
// - Linear offset calculation: -(2*10 + 3*1) = -23
2117+
// - Result mapping:
2118+
// * result[2][3] -> slice_alloc[0][0] (because 2*10+3 + (-23) = 0)
2119+
// * result[2][4] -> slice_alloc[0][1] (because 2*10+4 + (-23) = 1)
2120+
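// * result[3][3] -> linear offset 10 (because 3*10+3 + (-23) = 10); NOTE: slice_alloc[1][0] is at linear offset 2 in the contiguous 2x2 allocation, so a row stride of 10 lands past its end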
2121+
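// * result[3][4] -> linear offset 11 (because 3*10+4 + (-23) = 11); NOTE: slice_alloc[1][1] is at linear offset 3 in the contiguous 2x2 allocation, so a row stride of 10 lands past its end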
2122+
%adjusted_view = memref.reinterpret_cast %slice_alloc to
2123+
offset: [%neg_offset], sizes: [10, 10], strides: [%c10, %c1]
2124+
: memref<?x?xf32> to memref<10x10xf32, strided<[?, ?], offset: ?>>
2125+
2126+
// Cast to the expected return type
2127+
%result = memref.cast %adjusted_view : memref<10x10xf32, strided<[?, ?], offset: ?>> to memref<10x10xf32>
2128+
2129+
acc.yield %result : memref<10x10xf32>
2130+
}

0 commit comments

Comments
 (0)