@@ -505,6 +505,84 @@ def OpenACC_DataBoundsOp : OpenACC_Op<"bounds",
505505 ];
506506}
507507
508+ //===----------------------------------------------------------------------===//
509+ // DataBounds accessor operations
510+ //===----------------------------------------------------------------------===//
511+
512+ def OpenACC_GetLowerboundOp : OpenACC_Op<"get_lowerbound", [NoMemoryEffect]> { // Accessor op `acc.get_lowerbound`; declared with the NoMemoryEffect trait.
513+ let summary = "Extract lowerbound from OpenACC data bounds.";
514+ let description = [{
515+ This operation extracts the lowerbound value from an `acc.bounds` value.
516+ If the data bounds does not have a lowerbound specified, it means it is zero.
517+
518+ Example:
519+ ```mlir
520+ %lb = acc.get_lowerbound %bounds : (!acc.data_bounds_ty) -> index
521+ ```
522+ }];
523+
524+ let arguments = (ins OpenACC_DataBoundsType:$bounds); // Single operand: the data-bounds value to query.
525+ let results = (outs Index:$result); // Single result of `index` type holding the lowerbound.
526+
527+ let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)"; // Textual form: operand, optional attr-dict, then `: (operand-type) -> result-type`, matching the example in the description.
528+ }
529+
530+ def OpenACC_GetUpperboundOp : OpenACC_Op<"get_upperbound", [NoMemoryEffect]> { // Accessor op `acc.get_upperbound`; declared with the NoMemoryEffect trait.
531+ let summary = "Extract upperbound from OpenACC data bounds.";
532+ let description = [{
533+ This operation extracts the upperbound value from an `acc.bounds` value.
534+ If the data bounds does not have an upperbound specified, this operation
535+ uses the extent to compute it.
536+
537+ Example:
538+ ```mlir
539+ %ub = acc.get_upperbound %bounds : (!acc.data_bounds_ty) -> index
540+ ```
541+ }];
542+
543+ let arguments = (ins OpenACC_DataBoundsType:$bounds); // Single operand: the data-bounds value to query.
544+ let results = (outs Index:$result); // Single result of `index` type holding the upperbound.
545+
546+ let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)"; // Textual form: operand, optional attr-dict, then `: (operand-type) -> result-type`, matching the example in the description.
547+ }
548+
549+ def OpenACC_GetStrideOp : OpenACC_Op<"get_stride", [NoMemoryEffect]> { // Accessor op `acc.get_stride`; declared with the NoMemoryEffect trait.
550+ let summary = "Extract stride from OpenACC data bounds.";
551+ let description = [{
552+ This operation extracts the stride value from an `acc.bounds` value.
553+ If the data bounds does not have a stride specified, it defaults to 1.
554+
555+ Example:
556+ ```mlir
557+ %stride = acc.get_stride %bounds : (!acc.data_bounds_ty) -> index
558+ ```
559+ }];
560+
561+ let arguments = (ins OpenACC_DataBoundsType:$bounds); // Single operand: the data-bounds value to query.
562+ let results = (outs Index:$result); // Single result of `index` type holding the stride.
563+
564+ let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)"; // Textual form: operand, optional attr-dict, then `: (operand-type) -> result-type`, matching the example in the description.
565+ }
566+
567+ def OpenACC_GetExtentOp : OpenACC_Op<"get_extent", [NoMemoryEffect]> { // Accessor op `acc.get_extent`; declared with the NoMemoryEffect trait.
568+ let summary = "Extract extent from OpenACC data bounds.";
569+ let description = [{
570+ This operation extracts the extent value from an `acc.bounds` value.
571+ If the data bounds does not have an extent specified, it is computed
572+ from the upperbound.
573+
574+ Example:
575+ ```mlir
576+ %extent = acc.get_extent %bounds : (!acc.data_bounds_ty) -> index
577+ ```
578+ }];
579+
580+ let arguments = (ins OpenACC_DataBoundsType:$bounds); // Single operand: the data-bounds value to query.
581+ let results = (outs Index:$result); // Single result of `index` type holding the extent.
582+
583+ let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)"; // Textual form: operand, optional attr-dict, then `: (operand-type) -> result-type`, matching the example in the description.
584+ }
585+
508586// Data entry operation does not refer to OpenACC spec terminology, but to
509587// terminology used in this dialect. It refers to data operations that will
510588// appear before data or compute region. It will be used as the base of acc
@@ -1180,30 +1258,48 @@ def OpenACC_PrivateRecipeOp
11801258 1. The initializer region specifies how to allocate and initialize a new
11811259 private value. For example in Fortran, a derived-type might have a
11821260 default initialization. The region has an argument that contains the
1183- value that need to be privatized. This is useful if the type is not
1184- known at compile time and the private value is needed to create its
1185- copy.
1261+ original value that needs to be privatized, followed by bounds arguments
1262+ (if any) in order from innermost to outermost dimension. The region
1263+ must yield the privatized copy.
11861264 2. The destroy region specifies how to destruct the value when it reaches
1187- its end of life. It takes the privatized value as argument.
1265+ its end of life. It takes the original value, the privatized value, and
1266+ bounds arguments (if any) in the same order as the init region.
11881267
11891268 A single privatization recipe can be used for multiple operands if they have
11901269 the same type and do not require a specific default initialization.
11911270
11921271 Example:
11931272
11941273 ```mlir
1195- acc.private.recipe @privatization_f32 : f32 init {
1196- ^bb0(%0: f32 ):
1274+ acc.private.recipe @privatization_memref : memref<10x20xf32> init {
1275+ ^bb0(%original: memref<10x20xf32> ):
11971276 // init region contains a sequence of operations to create and
1198- // initialize the copy if needed. It yields the create copy.
1277+ // initialize the copy. It yields the privatized copy.
1278+ %alloca = memref.alloca() : memref<10x20xf32>
1279+ acc.yield %alloca : memref<10x20xf32>
1280+ } destroy {
1281+ ^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
1282+ // destroy region is empty since alloca is automatically cleaned up
1283+ acc.terminator
1284+ }
1285+
1286+ // Example with bounds for array slicing:
1287+ acc.private.recipe @privatization_slice : memref<10x20xf32> init {
1288+ ^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1289+ // Extract bounds and create appropriately sized allocation
1290+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1291+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1292+ %slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1293+ // ... base pointer adjustment logic ...
1294+ acc.yield %result : memref<10x20xf32>
11991295 } destroy {
1200- ^bb0(%0: f32)
1201- // destroy region contains a sequences of operations to destruct the
1202- // created copy.
1296+ ^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1297+ // Cleanup is automatic for alloca-based allocations
1298+ acc.terminator
12031299 }
12041300
12051301 // The privatization symbol is then used in the corresponding operation.
1206- acc.parallel private(@privatization_f32 -> %a : f32 ) {
1302+ acc.parallel private(@privatization_memref -> %a : memref<10x20xf32> ) {
12071303 }
12081304 ```
12091305 }];
@@ -1239,38 +1335,64 @@ def OpenACC_FirstprivateRecipeOp
12391335 1. The initializer region specifies how to allocate and initialize a new
12401336 private value. For example in Fortran, a derived-type might have a
12411337 default initialization. The region has an argument that contains the
1242- value that need to be privatized. This is useful if the type is not
1243- known at compile time and the private value is needed to create its
1244- copy.
1338+ original value that needs to be privatized, followed by bounds arguments
1339+ (if any) in order from innermost to outermost dimension. The region must
1340+ yield the privatized copy.
12451341 2. The copy region specifies how to copy the initial value to the newly
1246- created private value. It takes the initial value and the privatized
1247- value as arguments.
1342+ created private value. It takes the original value, the privatized
1343+ value, followed by bounds arguments (if any) in the same order.
12481344 3. The destroy region specifies how to destruct the value when it reaches
1249- its end of life. It takes the privatized value as argument. It is
1250- optional.
1345+ its end of life. It takes the original value, the privatized value, and
1346+ bounds arguments (if any) in the same order. It is optional.
12511347
12521348 A single privatization recipe can be used for multiple operands if they have
12531349 the same type and do not require a specific default initialization.
12541350
12551351 Example:
12561352
12571353 ```mlir
1258- acc.firstprivate.recipe @privatization_f32 : f32 init {
1259- ^bb0(%0: f32 ):
1354+ acc.firstprivate.recipe @firstprivate_memref : memref<10x20xf32> init {
1355+ ^bb0(%original: memref<10x20xf32> ):
12601356 // init region contains a sequence of operations to create and
1261- // initialize the copy if needed. It yields the create copy.
1357+ // initialize the copy. It yields the privatized copy.
1358+ %alloca = memref.alloca() : memref<10x20xf32>
1359+ acc.yield %alloca : memref<10x20xf32>
12621360 } copy {
1263- ^bb0(%0: f32 , %1: !llvm.ptr ):
1361+ ^bb0(%original: memref<10x20xf32> , %privatized: memref<10x20xf32> ):
12641362 // copy region contains a sequence of operations to copy the initial value
12651363 // of the firstprivate value to the newly created value.
1364+ memref.copy %original, %privatized : memref<10x20xf32> to memref<10x20xf32>
1365+ acc.terminator
12661366 } destroy {
1267- ^bb0(%0: f32)
1268- // destroy region contains a sequences of operations to destruct the
1269- // created copy.
1367+ ^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
1368+ // destroy region is empty since alloca is automatically cleaned up
1369+ acc.terminator
1370+ }
1371+
1372+ // Example with bounds for array slicing:
1373+ acc.firstprivate.recipe @firstprivate_slice : memref<10x20xf32> init {
1374+ ^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1375+ // Extract bounds and create appropriately sized allocation
1376+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1377+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1378+ %slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1379+ // ... base pointer adjustment logic ...
1380+ acc.yield %result : memref<10x20xf32>
1381+ } copy {
1382+ ^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1383+ // Copy the slice portion from original to privatized
1384+ %lb_inner = acc.get_lowerbound %bounds_inner : (!acc.data_bounds_ty) -> index
1385+ %lb_outer = acc.get_lowerbound %bounds_outer : (!acc.data_bounds_ty) -> index
1386+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1387+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1388+ %subview = memref.subview %original[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1389+ : memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1390+ // Copy subview to privatized...
1391+ acc.terminator
12701392 }
12711393
12721394 // The privatization symbol is then used in the corresponding operation.
1273- acc.parallel firstprivate(@privatization_f32 -> %a : f32 ) {
1395+ acc.parallel firstprivate(@firstprivate_memref -> %a : memref<10x20xf32> ) {
12741396 }
12751397 ```
12761398 }];
@@ -1305,40 +1427,75 @@ def OpenACC_ReductionRecipeOp
13051427 mandatory regions and one optional region.
13061428
13071429 1. The initializer region specifies how to initialize the local reduction
1308- value. The region has a first argument that contains the value of the
1309- reduction accumulator at the start of the reduction. It is expected to
1310- `acc.yield` the new value. Extra arguments can be added to deal with
1311- dynamic arrays .
1312- 2. The reduction region contains a sequences of operations to combine two
1313- values of the reduction type into one. It has at least two arguments
1314- and it is expected to `acc.yield` the combined value. Extra arguments
1315- can be added to deal with dynamic arrays .
1430+ value. The region has a first argument that contains the original value
1431+ that needs to be reduced, followed by bounds arguments (if any) in order
1432+ from innermost to outermost dimension. It is expected to `acc.yield` the
1433+ initialized reduction value.
1434+ 2. The combiner region contains a sequence of operations to combine two
1435+ values of the reduction type into one. It has the first reduction value,
1436+ the second reduction value, followed by bounds arguments (if any) in the
1437+ same order. It is expected to `acc.yield` the combined value.
13161438 3. The optional destroy region specifies how to destruct the value when it
1317- reaches its end of life. It takes the reduction value as argument.
1439+ reaches its end of life. It takes the original value, the reduction value,
1440+ and bounds arguments (if any) in the same order.
13181441
13191442 Example:
13201443
13211444 ```mlir
1322- acc.reduction.recipe @reduction_add_i64 : i64 reduction_operator<add> init {
1323- ^bb0(%0: i64 ):
1445+ acc.reduction.recipe @reduction_add_memref : memref<10x20xf32> reduction_operator<add> init {
1446+ ^bb0(%original: memref<10x20xf32> ):
13241447 // init region contains a sequence of operations to initialize the local
13251448 // reduction value as specified in 2.5.15
1326- %c0 = arith.constant 0 : i64
1327- acc.yield %c0 : i64
1449+ %alloca = memref.alloca() : memref<10x20xf32>
1450+ %cst = arith.constant 0.0 : f32
1451+ linalg.fill ins(%cst : f32) outs(%alloca : memref<10x20xf32>)
1452+ acc.yield %alloca : memref<10x20xf32>
13281453 } combiner {
1329- ^bb0(%0: i64 , %1: i64)
1454+ ^bb0(%lhs: memref<10x20xf32> , %rhs: memref<10x20xf32>):
13301455 // combiner region contains a sequence of operations to combine
13311456 // two values into one.
1332- %2 = arith.addi %0, %1 : i64
1333- acc.yield %2 : i64
1457+ linalg.add ins(%lhs, %rhs : memref<10x20xf32>, memref<10x20xf32>)
1458+ outs(%lhs : memref<10x20xf32>)
1459+ acc.yield %lhs : memref<10x20xf32>
13341460 } destroy {
1335- ^bb0(%0: i64)
1336- // destroy region contains a sequence of operations to destruct the
1337- // created copy.
1461+ ^bb0(%original: memref<10x20xf32>, %reduction: memref<10x20xf32>):
1462+ // destroy region is empty since alloca is automatically cleaned up
1463+ acc.terminator
1464+ }
1465+
1466+ // Example with bounds for array slicing:
1467+ acc.reduction.recipe @reduction_add_slice : memref<10x20xf32> reduction_operator<add> init {
1468+ ^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1469+ // Extract bounds and create appropriately sized allocation
1470+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1471+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1472+ %slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1473+ %cst = arith.constant 0.0 : f32
1474+ linalg.fill ins(%cst : f32) outs(%slice_alloc : memref<?x?xf32>)
1475+ // ... base pointer adjustment logic ...
1476+ acc.yield %result : memref<10x20xf32>
1477+ } combiner {
1478+ ^bb0(%lhs: memref<10x20xf32>, %rhs: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1479+ // Extract bounds to operate only on the slice portion
1480+ %lb_inner = acc.get_lowerbound %bounds_inner : (!acc.data_bounds_ty) -> index
1481+ %lb_outer = acc.get_lowerbound %bounds_outer : (!acc.data_bounds_ty) -> index
1482+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1483+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1484+
1485+ // Create subviews to access only the slice portions
1486+ %lhs_slice = memref.subview %lhs[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1487+ : memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1488+ %rhs_slice = memref.subview %rhs[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1489+ : memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1490+
1491+ // Combine only the slice portions
1492+ linalg.add ins(%lhs_slice, %rhs_slice : memref<?x?xf32, strided<[20, 1], offset: ?>>, memref<?x?xf32, strided<[20, 1], offset: ?>>)
1493+ outs(%lhs_slice : memref<?x?xf32, strided<[20, 1], offset: ?>>)
1494+ acc.yield %lhs : memref<10x20xf32>
13381495 }
13391496
13401497 // The reduction symbol is then used in the corresponding operation.
1341- acc.parallel reduction(@reduction_add_i64 -> %a : i64 ) {
1498+ acc.parallel reduction(@reduction_add_memref -> %a : memref<10x20xf32> ) {
13421499 }
13431500 ```
13441501
0 commit comments