@@ -505,6 +505,84 @@ def OpenACC_DataBoundsOp : OpenACC_Op<"bounds",
505
505
];
506
506
}
507
507
508
+ //===----------------------------------------------------------------------===//
509
+ // DataBounds accessor operations
510
+ //===----------------------------------------------------------------------===//
511
+
512
+ def OpenACC_GetLowerboundOp : OpenACC_Op<"get_lowerbound", [NoMemoryEffect]> {
513
+ let summary = "Extract lowerbound from OpenACC data bounds.";
514
+ let description = [{
515
+ This operation extracts the lowerbound value from an `acc.bounds` value.
516
+ If the `acc.bounds` value does not specify a lowerbound, the result is zero.
517
+
518
+ Example:
519
+ ```mlir
520
+ %lb = acc.get_lowerbound %bounds : (!acc.data_bounds_ty) -> index
521
+ ```
522
+ }];
523
+
524
+ let arguments = (ins OpenACC_DataBoundsType:$bounds);
525
+ let results = (outs Index:$result);
526
+
527
+ let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
528
+ }
529
+
530
+ def OpenACC_GetUpperboundOp : OpenACC_Op<"get_upperbound", [NoMemoryEffect]> {
531
+ let summary = "Extract upperbound from OpenACC data bounds.";
532
+ let description = [{
533
+ This operation extracts the upperbound value from an `acc.bounds` value.
534
+ If the data bounds does not have an upperbound specified, this operation
535
+ uses the extent to compute it.
536
+
537
+ Example:
538
+ ```mlir
539
+ %ub = acc.get_upperbound %bounds : (!acc.data_bounds_ty) -> index
540
+ ```
541
+ }];
542
+
543
+ let arguments = (ins OpenACC_DataBoundsType:$bounds);
544
+ let results = (outs Index:$result);
545
+
546
+ let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
547
+ }
548
+
549
+ def OpenACC_GetStrideOp : OpenACC_Op<"get_stride", [NoMemoryEffect]> {
550
+ let summary = "Extract stride from OpenACC data bounds.";
551
+ let description = [{
552
+ This operation extracts the stride value from an `acc.bounds` value.
553
+ If the data bounds does not have a stride specified, it defaults to 1.
554
+
555
+ Example:
556
+ ```mlir
557
+ %stride = acc.get_stride %bounds : (!acc.data_bounds_ty) -> index
558
+ ```
559
+ }];
560
+
561
+ let arguments = (ins OpenACC_DataBoundsType:$bounds);
562
+ let results = (outs Index:$result);
563
+
564
+ let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
565
+ }
566
+
567
+ def OpenACC_GetExtentOp : OpenACC_Op<"get_extent", [NoMemoryEffect]> {
568
+ let summary = "Extract extent from OpenACC data bounds.";
569
+ let description = [{
570
+ This operation extracts the extent value from an `acc.bounds` value.
571
+ If the data bounds does not have an extent specified, it is computed
572
+ from the upperbound.
573
+
574
+ Example:
575
+ ```mlir
576
+ %extent = acc.get_extent %bounds : (!acc.data_bounds_ty) -> index
577
+ ```
578
+ }];
579
+
580
+ let arguments = (ins OpenACC_DataBoundsType:$bounds);
581
+ let results = (outs Index:$result);
582
+
583
+ let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
584
+ }
585
+
508
586
// Data entry operation does not refer to OpenACC spec terminology, but to
509
587
// terminology used in this dialect. It refers to data operations that will
510
588
// appear before a data or compute region. It will be used as the base of acc
@@ -1180,30 +1258,48 @@ def OpenACC_PrivateRecipeOp
1180
1258
1. The initializer region specifies how to allocate and initialize a new
1181
1259
private value. For example in Fortran, a derived-type might have a
1182
1260
default initialization. The region has an argument that contains the
1183
- value that need to be privatized. This is useful if the type is not
1184
- known at compile time and the private value is needed to create its
1185
- copy.
1261
+ original value that needs to be privatized, followed by bounds arguments
1262
+ (if any) in order from innermost to outermost dimension. The region
1263
+ must yield the privatized copy.
1186
1264
2. The destroy region specifies how to destruct the value when it reaches
1187
- its end of life. It takes the privatized value as argument.
1265
+ its end of life. It takes the original value, the privatized value, and
1266
+ bounds arguments (if any) in the same order as the init region.
1188
1267
1189
1268
A single privatization recipe can be used for multiple operands if they have
1190
1269
the same type and do not require a specific default initialization.
1191
1270
1192
1271
Example:
1193
1272
1194
1273
```mlir
1195
- acc.private.recipe @privatization_f32 : f32 init {
1196
- ^bb0(%0: f32 ):
1274
+ acc.private.recipe @privatization_memref : memref<10x20xf32> init {
1275
+ ^bb0(%original: memref<10x20xf32> ):
1197
1276
// init region contains a sequence of operations to create and
1198
- // initialize the copy if needed. It yields the create copy.
1277
+ // initialize the copy. It yields the privatized copy.
1278
+ %alloca = memref.alloca() : memref<10x20xf32>
1279
+ acc.yield %alloca : memref<10x20xf32>
1280
+ } destroy {
1281
+ ^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
1282
+ // destroy region is empty since alloca is automatically cleaned up
1283
+ acc.terminator
1284
+ }
1285
+
1286
+ // Example with bounds for array slicing:
1287
+ acc.private.recipe @privatization_slice : memref<10x20xf32> init {
1288
+ ^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1289
+ // Extract bounds and create appropriately sized allocation
1290
+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1291
+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1292
+ %slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1293
+ // ... base pointer adjustment logic (defines %result) ...
1294
+ acc.yield %result : memref<10x20xf32>
1199
1295
} destroy {
1200
- ^bb0(%0: f32)
1201
- // destroy region contains a sequences of operations to destruct the
1202
- // created copy.
1296
+ ^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1297
+ // Cleanup is automatic for alloca-based allocations
1298
+ acc.terminator
1203
1299
}
1204
1300
1205
1301
// The privatization symbol is then used in the corresponding operation.
1206
- acc.parallel private(@privatization_f32 -> %a : f32 ) {
1302
+ acc.parallel private(@privatization_memref -> %a : memref<10x20xf32> ) {
1207
1303
}
1208
1304
```
1209
1305
}];
@@ -1239,38 +1335,64 @@ def OpenACC_FirstprivateRecipeOp
1239
1335
1. The initializer region specifies how to allocate and initialize a new
1240
1336
private value. For example in Fortran, a derived-type might have a
1241
1337
default initialization. The region has an argument that contains the
1242
- value that need to be privatized. This is useful if the type is not
1243
- known at compile time and the private value is needed to create its
1244
- copy.
1338
+ original value that needs to be privatized, followed by bounds arguments
1339
+ (if any) in order from innermost to outermost dimension. The region must
1340
+ yield the privatized copy.
1245
1341
2. The copy region specifies how to copy the initial value to the newly
1246
- created private value. It takes the initial value and the privatized
1247
- value as arguments.
1342
+ created private value. It takes the original value, the privatized
1343
+ value, followed by bounds arguments (if any) in the same order as the init region.
1248
1344
3. The destroy region specifies how to destruct the value when it reaches
1249
- its end of life. It takes the privatized value as argument. It is
1250
- optional.
1345
+ its end of life. It takes the original value, the privatized value, and
1346
+ bounds arguments (if any) in the same order as the init region. It is optional.
1251
1347
1252
1348
A single privatization recipe can be used for multiple operands if they have
1253
1349
the same type and do not require a specific default initialization.
1254
1350
1255
1351
Example:
1256
1352
1257
1353
```mlir
1258
- acc.firstprivate.recipe @privatization_f32 : f32 init {
1259
- ^bb0(%0: f32 ):
1354
+ acc.firstprivate.recipe @firstprivate_memref : memref<10x20xf32> init {
1355
+ ^bb0(%original: memref<10x20xf32> ):
1260
1356
// init region contains a sequence of operations to create and
1261
- // initialize the copy if needed. It yields the create copy.
1357
+ // initialize the copy. It yields the privatized copy.
1358
+ %alloca = memref.alloca() : memref<10x20xf32>
1359
+ acc.yield %alloca : memref<10x20xf32>
1262
1360
} copy {
1263
- ^bb0(%0: f32 , %1: !llvm.ptr ):
1361
+ ^bb0(%original: memref<10x20xf32> , %privatized: memref<10x20xf32> ):
1264
1362
// copy region contains a sequence of operations to copy the initial value
1265
1363
// of the firstprivate value to the newly created value.
1364
+ memref.copy %original, %privatized : memref<10x20xf32> to memref<10x20xf32>
1365
+ acc.terminator
1266
1366
} destroy {
1267
- ^bb0(%0: f32)
1268
- // destroy region contains a sequences of operations to destruct the
1269
- // created copy.
1367
+ ^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
1368
+ // destroy region is empty since alloca is automatically cleaned up
1369
+ acc.terminator
1370
+ }
1371
+
1372
+ // Example with bounds for array slicing:
1373
+ acc.firstprivate.recipe @firstprivate_slice : memref<10x20xf32> init {
1374
+ ^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1375
+ // Extract bounds and create appropriately sized allocation
1376
+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1377
+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1378
+ %slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1379
+ // ... base pointer adjustment logic (defines %result) ...
1380
+ acc.yield %result : memref<10x20xf32>
1381
+ } copy {
1382
+ ^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1383
+ // Copy the slice portion from original to privatized
1384
+ %lb_inner = acc.get_lowerbound %bounds_inner : (!acc.data_bounds_ty) -> index
1385
+ %lb_outer = acc.get_lowerbound %bounds_outer : (!acc.data_bounds_ty) -> index
1386
+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1387
+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1388
+ %subview = memref.subview %original[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1389
+ : memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1390
+ // Copy subview to privatized...
1391
+ acc.terminator
1270
1392
}
1271
1393
1272
1394
// The privatization symbol is then used in the corresponding operation.
1273
- acc.parallel firstprivate(@privatization_f32 -> %a : f32 ) {
1395
+ acc.parallel firstprivate(@firstprivate_memref -> %a : memref<10x20xf32> ) {
1274
1396
}
1275
1397
```
1276
1398
}];
@@ -1305,40 +1427,75 @@ def OpenACC_ReductionRecipeOp
1305
1427
mandatory regions and one optional region.
1306
1428
1307
1429
1. The initializer region specifies how to initialize the local reduction
1308
- value. The region has a first argument that contains the value of the
1309
- reduction accumulator at the start of the reduction. It is expected to
1310
- `acc.yield` the new value. Extra arguments can be added to deal with
1311
- dynamic arrays .
1312
- 2. The reduction region contains a sequences of operations to combine two
1313
- values of the reduction type into one. It has at least two arguments
1314
- and it is expected to `acc.yield` the combined value. Extra arguments
1315
- can be added to deal with dynamic arrays .
1430
+ value. The region has a first argument that contains the original value
1431
+ that needs to be reduced, followed by bounds arguments (if any) in order
1432
+ from innermost to outermost dimension. It is expected to `acc.yield` the
1433
+ initialized reduction value.
1434
+ 2. The combiner region contains a sequence of operations to combine two
1435
+ values of the reduction type into one. It has the first reduction value,
1436
+ the second reduction value, followed by bounds arguments (if any) in the
1437
+ same order as the init region. It is expected to `acc.yield` the combined value.
1316
1438
3. The optional destroy region specifies how to destruct the value when it
1317
- reaches its end of life. It takes the reduction value as argument.
1439
+ reaches its end of life. It takes the original value, the reduction value,
1440
+ and bounds arguments (if any) in the same order as the init region.
1318
1441
1319
1442
Example:
1320
1443
1321
1444
```mlir
1322
- acc.reduction.recipe @reduction_add_i64 : i64 reduction_operator<add> init {
1323
- ^bb0(%0: i64 ):
1445
+ acc.reduction.recipe @reduction_add_memref : memref<10x20xf32> reduction_operator<add> init {
1446
+ ^bb0(%original: memref<10x20xf32> ):
1324
1447
// init region contains a sequence of operations to initialize the local
1325
1448
// reduction value as specified in section 2.5.15 of the OpenACC specification
1326
- %c0 = arith.constant 0 : i64
1327
- acc.yield %c0 : i64
1449
+ %alloca = memref.alloca() : memref<10x20xf32>
1450
+ %cst = arith.constant 0.0 : f32
1451
+ linalg.fill ins(%cst : f32) outs(%alloca : memref<10x20xf32>)
1452
+ acc.yield %alloca : memref<10x20xf32>
1328
1453
} combiner {
1329
- ^bb0(%0: i64 , %1: i64)
1454
+ ^bb0(%lhs: memref<10x20xf32> , %rhs: memref<10x20xf32>):
1330
1455
// combiner region contains a sequence of operations to combine
1331
1456
// two values into one.
1332
- %2 = arith.addi %0, %1 : i64
1333
- acc.yield %2 : i64
1457
+ linalg.add ins(%lhs, %rhs : memref<10x20xf32>, memref<10x20xf32>)
1458
+ outs(%lhs : memref<10x20xf32>)
1459
+ acc.yield %lhs : memref<10x20xf32>
1334
1460
} destroy {
1335
- ^bb0(%0: i64)
1336
- // destroy region contains a sequence of operations to destruct the
1337
- // created copy.
1461
+ ^bb0(%original: memref<10x20xf32>, %reduction: memref<10x20xf32>):
1462
+ // destroy region is empty since alloca is automatically cleaned up
1463
+ acc.terminator
1464
+ }
1465
+
1466
+ // Example with bounds for array slicing:
1467
+ acc.reduction.recipe @reduction_add_slice : memref<10x20xf32> reduction_operator<add> init {
1468
+ ^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1469
+ // Extract bounds and create appropriately sized allocation
1470
+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1471
+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1472
+ %slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
1473
+ %cst = arith.constant 0.0 : f32
1474
+ linalg.fill ins(%cst : f32) outs(%slice_alloc : memref<?x?xf32>)
1475
+ // ... base pointer adjustment logic (defines %result) ...
1476
+ acc.yield %result : memref<10x20xf32>
1477
+ } combiner {
1478
+ ^bb0(%lhs: memref<10x20xf32>, %rhs: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
1479
+ // Extract bounds to operate only on the slice portion
1480
+ %lb_inner = acc.get_lowerbound %bounds_inner : (!acc.data_bounds_ty) -> index
1481
+ %lb_outer = acc.get_lowerbound %bounds_outer : (!acc.data_bounds_ty) -> index
1482
+ %extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
1483
+ %extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
1484
+
1485
+ // Create subviews to access only the slice portions
1486
+ %lhs_slice = memref.subview %lhs[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1487
+ : memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1488
+ %rhs_slice = memref.subview %rhs[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
1489
+ : memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
1490
+
1491
+ // Combine only the slice portions
1492
+ linalg.add ins(%lhs_slice, %rhs_slice : memref<?x?xf32, strided<[20, 1], offset: ?>>, memref<?x?xf32, strided<[20, 1], offset: ?>>)
1493
+ outs(%lhs_slice : memref<?x?xf32, strided<[20, 1], offset: ?>>)
1494
+ acc.yield %lhs : memref<10x20xf32>
1338
1495
}
1339
1496
1340
1497
// The reduction symbol is then used in the corresponding operation.
1341
- acc.parallel reduction(@reduction_add_i64 -> %a : i64 ) {
1498
+ acc.parallel reduction(@reduction_add_memref -> %a : memref<10x20xf32> ) {
1342
1499
}
1343
1500
```
1344
1501
0 commit comments