Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
249 changes: 203 additions & 46 deletions mlir/include/mlir/Dialect/OpenACC/OpenACCOps.td
Original file line number Diff line number Diff line change
Expand Up @@ -505,6 +505,84 @@ def OpenACC_DataBoundsOp : OpenACC_Op<"bounds",
];
}

//===----------------------------------------------------------------------===//
// DataBounds accessor operations
//===----------------------------------------------------------------------===//

// Accessor op `acc.get_lowerbound`.
// Takes one operand (`$bounds`, constrained to OpenACC_DataBoundsType) and
// produces one `Index` result. The op is declared with the NoMemoryEffect
// trait.
def OpenACC_GetLowerboundOp : OpenACC_Op<"get_lowerbound", [NoMemoryEffect]> {
let summary = "Extract lowerbound from OpenACC data bounds.";
let description = [{
This operation extracts the lowerbound value from an `acc.bounds` value.
If the data bounds does not have a lowerbound specified, it means it is zero.

Example:
```mlir
%lb = acc.get_lowerbound %bounds : (!acc.data_bounds_ty) -> index
```
}];

// Single operand: the data bounds value being queried.
let arguments = (ins OpenACC_DataBoundsType:$bounds);
// Single result: the extracted lowerbound.
let results = (outs Index:$result);

// Textual form: operand, optional attribute dictionary, then a
// function-style signature `: (operand-type) -> result-type`.
let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
}

// Accessor op `acc.get_upperbound`.
// Takes one operand (`$bounds`, constrained to OpenACC_DataBoundsType) and
// produces one `Index` result. The op is declared with the NoMemoryEffect
// trait.
// NOTE(review): the review thread embedded in this definition states that the
// extent-based fallback mentioned in the description captures intent only and
// is not yet implemented.
def OpenACC_GetUpperboundOp : OpenACC_Op<"get_upperbound", [NoMemoryEffect]> {
let summary = "Extract upperbound from OpenACC data bounds.";
let description = [{
This operation extracts the upperbound value from an `acc.bounds` value.
If the data bounds does not have an upperbound specified, this operation
uses the extent to compute it.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oooh, awesome!

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This description is not yet implemented :) I just captured the intent. Sorry if it misled.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, no, I assumed as much. Just awesome that it is the intent, saves a lot of other work.


Example:
```mlir
%ub = acc.get_upperbound %bounds : (!acc.data_bounds_ty) -> index
```
}];

// Single operand: the data bounds value being queried.
let arguments = (ins OpenACC_DataBoundsType:$bounds);
// Single result: the extracted upperbound.
let results = (outs Index:$result);

// Textual form: operand, optional attribute dictionary, then a
// function-style signature `: (operand-type) -> result-type`.
let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
}

// Accessor op `acc.get_stride`.
// Takes one operand (`$bounds`, constrained to OpenACC_DataBoundsType) and
// produces one `Index` result. The op is declared with the NoMemoryEffect
// trait.
def OpenACC_GetStrideOp : OpenACC_Op<"get_stride", [NoMemoryEffect]> {
let summary = "Extract stride from OpenACC data bounds.";
let description = [{
This operation extracts the stride value from an `acc.bounds` value.
If the data bounds does not have a stride specified, it defaults to 1.

Example:
```mlir
%stride = acc.get_stride %bounds : (!acc.data_bounds_ty) -> index
```
}];

// Single operand: the data bounds value being queried.
let arguments = (ins OpenACC_DataBoundsType:$bounds);
// Single result: the extracted stride.
let results = (outs Index:$result);

// Textual form: operand, optional attribute dictionary, then a
// function-style signature `: (operand-type) -> result-type`.
let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
}

// Accessor op `acc.get_extent`.
// Takes one operand (`$bounds`, constrained to OpenACC_DataBoundsType) and
// produces one `Index` result. The op is declared with the NoMemoryEffect
// trait.
def OpenACC_GetExtentOp : OpenACC_Op<"get_extent", [NoMemoryEffect]> {
let summary = "Extract extent from OpenACC data bounds.";
let description = [{
This operation extracts the extent value from an `acc.bounds` value.
If the data bounds does not have an extent specified, it is computed
from the upperbound.

Example:
```mlir
%extent = acc.get_extent %bounds : (!acc.data_bounds_ty) -> index
```
}];

// Single operand: the data bounds value being queried.
let arguments = (ins OpenACC_DataBoundsType:$bounds);
// Single result: the extracted extent.
let results = (outs Index:$result);

// Textual form: operand, optional attribute dictionary, then a
// function-style signature `: (operand-type) -> result-type`.
let assemblyFormat = "$bounds attr-dict `:` `(` type($bounds) `)` `->` type($result)";
}

// Data entry operation does not refer to OpenACC spec terminology, but to
// terminology used in this dialect. It refers to data operations that will
// appear before data or compute region. It will be used as the base of acc
Expand Down Expand Up @@ -1180,30 +1258,48 @@ def OpenACC_PrivateRecipeOp
1. The initializer region specifies how to allocate and initialize a new
private value. For example in Fortran, a derived-type might have a
default initialization. The region has an argument that contains the
value that need to be privatized. This is useful if the type is not
known at compile time and the private value is needed to create its
copy.
original value that needs to be privatized, followed by bounds arguments
(if any) in order from innermost to outermost dimension. The region
must yield the privatized copy.
2. The destroy region specifies how to destruct the value when it reaches
its end of life. It takes the privatized value as argument.
its end of life. It takes the original value, the privatized value, and
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Curious what the use of the privatized version is here? Which should I be destroying, the 1st one, right?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The way I just documented this is that the original (read this as "non-private") variable is always passed in as the first argument for consistency. So it is the second one that should be destroyed. Are you OK with this?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see the first argument being 'useful' here? But I have no problem with it, I'll just have to 'note' it.

At one point I think the 'verifier' should start doing some level of failures for this signature though, which would help catch any issues I have.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

See here: #156716

I've changed the clang lowering to follow this signature.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure the first argument makes sense.

Copy link
Contributor Author

@razvanlupusoru razvanlupusoru Sep 3, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I added it for consistency so all regions take it as first argument. In all other regions including init, copy, combiner it is taken as first argument.

Although I am not sure it is useful for CIR and FIR, I can imagine cases where it can be. One scenario is when a dialect's type system does not encode sizes and it needs to be recovered from IR so that destroy can properly deallocate. I am definitely grasping at straws here - so hopefully the consistency reason is enough to justify it.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No strong opinion on that so I'll leave it to you.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At one point I think the 'verifier' should start doing some level of failures for this signature though, which would help catch any issues I have.

I agree. I have not added verifier yet because Flang does not yet comply with this.

bounds arguments (if any) in the same order as the init region.

A single privatization recipe can be used for multiple operands if they have
the same type and do not require a specific default initialization.

Example:

```mlir
acc.private.recipe @privatization_f32 : f32 init {
^bb0(%0: f32):
acc.private.recipe @privatization_memref : memref<10x20xf32> init {
^bb0(%original: memref<10x20xf32>):
// init region contains a sequence of operations to create and
// initialize the copy if needed. It yields the create copy.
// initialize the copy. It yields the privatized copy.
%alloca = memref.alloca() : memref<10x20xf32>
acc.yield %alloca : memref<10x20xf32>
} destroy {
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
// destroy region is empty since alloca is automatically cleaned up
acc.terminator
}

// Example with bounds for array slicing:
acc.private.recipe @privatization_slice : memref<10x20xf32> init {
^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
// Extract bounds and create appropriately sized allocation
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
%slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
// ... base pointer adjustment logic ...
acc.yield %result : memref<10x20xf32>
} destroy {
^bb0(%0: f32)
// destroy region contains a sequences of operations to destruct the
// created copy.
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
// Cleanup is automatic for alloca-based allocations
acc.terminator
}

// The privatization symbol is then used in the corresponding operation.
acc.parallel private(@privatization_f32 -> %a : f32) {
acc.parallel private(@privatization_memref -> %a : memref<10x20xf32>) {
}
```
}];
Expand Down Expand Up @@ -1239,38 +1335,64 @@ def OpenACC_FirstprivateRecipeOp
1. The initializer region specifies how to allocate and initialize a new
private value. For example in Fortran, a derived-type might have a
default initialization. The region has an argument that contains the
value that need to be privatized. This is useful if the type is not
known at compile time and the private value is needed to create its
copy.
original value that needs to be privatized, followed by bounds arguments
(if any) in order from innermost to outermost dimension. The region must
yield the privatized copy.
2. The copy region specifies how to copy the initial value to the newly
created private value. It takes the initial value and the privatized
value as arguments.
created private value. It takes the original value, the privatized
value, followed by bounds arguments (if any) in the same order.
3. The destroy region specifies how to destruct the value when it reaches
its end of life. It takes the privatized value as argument. It is
optional.
its end of life. It takes the original value, the privatized value, and
bounds arguments (if any) in the same order. It is optional.

A single privatization recipe can be used for multiple operands if they have
the same type and do not require a specific default initialization.

Example:

```mlir
acc.firstprivate.recipe @privatization_f32 : f32 init {
^bb0(%0: f32):
acc.firstprivate.recipe @firstprivate_memref : memref<10x20xf32> init {
^bb0(%original: memref<10x20xf32>):
// init region contains a sequence of operations to create and
// initialize the copy if needed. It yields the create copy.
// initialize the copy. It yields the privatized copy.
%alloca = memref.alloca() : memref<10x20xf32>
acc.yield %alloca : memref<10x20xf32>
} copy {
^bb0(%0: f32, %1: !llvm.ptr):
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
// copy region contains a sequence of operations to copy the initial value
// of the firstprivate value to the newly created value.
memref.copy %original, %privatized : memref<10x20xf32> to memref<10x20xf32>
acc.terminator
} destroy {
^bb0(%0: f32)
// destroy region contains a sequences of operations to destruct the
// created copy.
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>):
// destroy region is empty since alloca is automatically cleaned up
acc.terminator
}

// Example with bounds for array slicing:
acc.firstprivate.recipe @firstprivate_slice : memref<10x20xf32> init {
^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
// Extract bounds and create appropriately sized allocation
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
%slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
// ... base pointer adjustment logic ...
acc.yield %result : memref<10x20xf32>
} copy {
^bb0(%original: memref<10x20xf32>, %privatized: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
// Copy the slice portion from original to privatized
%lb_inner = acc.get_lowerbound %bounds_inner : (!acc.data_bounds_ty) -> index
%lb_outer = acc.get_lowerbound %bounds_outer : (!acc.data_bounds_ty) -> index
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
%subview = memref.subview %original[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
: memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
// Copy subview to privatized...
acc.terminator
}

// The privatization symbol is then used in the corresponding operation.
acc.parallel firstprivate(@privatization_f32 -> %a : f32) {
acc.parallel firstprivate(@firstprivate_memref -> %a : memref<10x20xf32>) {
}
```
}];
Expand Down Expand Up @@ -1305,40 +1427,75 @@ def OpenACC_ReductionRecipeOp
mandatory regions and one optional region.

1. The initializer region specifies how to initialize the local reduction
value. The region has a first argument that contains the value of the
reduction accumulator at the start of the reduction. It is expected to
`acc.yield` the new value. Extra arguments can be added to deal with
dynamic arrays.
2. The reduction region contains a sequences of operations to combine two
values of the reduction type into one. It has at least two arguments
and it is expected to `acc.yield` the combined value. Extra arguments
can be added to deal with dynamic arrays.
value. The region has a first argument that contains the original value
that needs to be reduced, followed by bounds arguments (if any) in order
from innermost to outermost dimension. It is expected to `acc.yield` the
initialized reduction value.
2. The combiner region contains a sequence of operations to combine two
values of the reduction type into one. It has the first reduction value,
the second reduction value, followed by bounds arguments (if any) in the
same order. It is expected to `acc.yield` the combined value.
3. The optional destroy region specifies how to destruct the value when it
reaches its end of life. It takes the reduction value as argument.
reaches its end of life. It takes the original value, the reduction value,
and bounds arguments (if any) in the same order.

Example:

```mlir
acc.reduction.recipe @reduction_add_i64 : i64 reduction_operator<add> init {
^bb0(%0: i64):
acc.reduction.recipe @reduction_add_memref : memref<10x20xf32> reduction_operator<add> init {
^bb0(%original: memref<10x20xf32>):
// init region contains a sequence of operations to initialize the local
// reduction value as specified in 2.5.15
%c0 = arith.constant 0 : i64
acc.yield %c0 : i64
%alloca = memref.alloca() : memref<10x20xf32>
%cst = arith.constant 0.0 : f32
linalg.fill ins(%cst : f32) outs(%alloca : memref<10x20xf32>)
acc.yield %alloca : memref<10x20xf32>
} combiner {
^bb0(%0: i64, %1: i64)
^bb0(%lhs: memref<10x20xf32>, %rhs: memref<10x20xf32>):
// combiner region contains a sequence of operations to combine
// two values into one.
%2 = arith.addi %0, %1 : i64
acc.yield %2 : i64
linalg.add ins(%lhs, %rhs : memref<10x20xf32>, memref<10x20xf32>)
outs(%lhs : memref<10x20xf32>)
acc.yield %lhs : memref<10x20xf32>
} destroy {
^bb0(%0: i64)
// destroy region contains a sequence of operations to destruct the
// created copy.
^bb0(%original: memref<10x20xf32>, %reduction: memref<10x20xf32>):
// destroy region is empty since alloca is automatically cleaned up
acc.terminator
}

// Example with bounds for array slicing:
acc.reduction.recipe @reduction_add_slice : memref<10x20xf32> reduction_operator<add> init {
^bb0(%original: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
// Extract bounds and create appropriately sized allocation
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index
%slice_alloc = memref.alloca(%extent_outer, %extent_inner) : memref<?x?xf32>
%cst = arith.constant 0.0 : f32
linalg.fill ins(%cst : f32) outs(%slice_alloc : memref<?x?xf32>)
// ... base pointer adjustment logic ...
acc.yield %result : memref<10x20xf32>
} combiner {
^bb0(%lhs: memref<10x20xf32>, %rhs: memref<10x20xf32>, %bounds_inner: !acc.data_bounds_ty, %bounds_outer: !acc.data_bounds_ty):
// Extract bounds to operate only on the slice portion
%lb_inner = acc.get_lowerbound %bounds_inner : (!acc.data_bounds_ty) -> index
%lb_outer = acc.get_lowerbound %bounds_outer : (!acc.data_bounds_ty) -> index
%extent_inner = acc.get_extent %bounds_inner : (!acc.data_bounds_ty) -> index
%extent_outer = acc.get_extent %bounds_outer : (!acc.data_bounds_ty) -> index

// Create subviews to access only the slice portions
%lhs_slice = memref.subview %lhs[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
: memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>
%rhs_slice = memref.subview %rhs[%lb_outer, %lb_inner][%extent_outer, %extent_inner][1, 1]
: memref<10x20xf32> to memref<?x?xf32, strided<[20, 1], offset: ?>>

// Combine only the slice portions
linalg.add ins(%lhs_slice, %rhs_slice : memref<?x?xf32, strided<[20, 1], offset: ?>>, memref<?x?xf32, strided<[20, 1], offset: ?>>)
outs(%lhs_slice : memref<?x?xf32, strided<[20, 1], offset: ?>>)
acc.yield %lhs : memref<10x20xf32>
}

// The reduction symbol is then used in the corresponding operation.
acc.parallel reduction(@reduction_add_i64 -> %a : i64) {
acc.parallel reduction(@reduction_add_memref -> %a : memref<10x20xf32>) {
}
```

Expand Down
60 changes: 60 additions & 0 deletions mlir/test/Dialect/OpenACC/ops.mlir
Original file line number Diff line number Diff line change
Expand Up @@ -2068,3 +2068,63 @@ func.func @acc_loop_container() {
// CHECK: acc.loop
// CHECK: scf.for
// CHECK: scf.for

// -----

// Test private recipe with data bounds for array slicing
// The init region receives the original memref followed by two bounds values,
// reads them with the acc.get_lowerbound / acc.get_extent / acc.get_stride
// accessor ops, allocates a slice-sized buffer, and yields a memref<10x10xf32>
// view of it.
acc.private.recipe @privatization_memref_slice : memref<10x10xf32> init {
^bb0(%arg0: memref<10x10xf32>, %bounds0: !acc.data_bounds_ty, %bounds1: !acc.data_bounds_ty):
// NOTE: OpenACC bounds are ordered from inner-most to outer-most dimension (rank 0 = inner-most)
// MLIR memref<10x10xf32> has first dimension as outer (10) and second as inner (10)
// So bounds0 corresponds to memref's second dimension (inner), bounds1 to first dimension (outer)

// Extract bounds information for the slice
// bounds0 = inner dimension (memref dimension 1)
%lb0 = acc.get_lowerbound %bounds0 : (!acc.data_bounds_ty) -> index
%extent0 = acc.get_extent %bounds0 : (!acc.data_bounds_ty) -> index
%stride0 = acc.get_stride %bounds0 : (!acc.data_bounds_ty) -> index

// bounds1 = outer dimension (memref dimension 0)
%lb1 = acc.get_lowerbound %bounds1 : (!acc.data_bounds_ty) -> index
%extent1 = acc.get_extent %bounds1 : (!acc.data_bounds_ty) -> index
%stride1 = acc.get_stride %bounds1 : (!acc.data_bounds_ty) -> index
// NOTE(review): %stride0 and %stride1 are not referenced again in this
// region; the offset computation and the view below use the constants %c10
// and %c1 instead. The "stride1"/"stride0" named in the comments further down
// are those constants, not these SSA values.

// Allocate memory for only the slice dimensions on the stack
// Note: memref dimensions are outer-first, so extent1 (outer) comes first, extent0 (inner) second
%slice_alloc = memref.alloca(%extent1, %extent0) : memref<?x?xf32>

// Adjust base pointer to account for the slice offset
// We need to create a view that makes the slice appear as if it starts at the original indices
%c0 = arith.constant 0 : index
%c10 = arith.constant 10 : index
%c1 = arith.constant 1 : index

// Calculate linear offset: -(lb1 * stride1 + lb0 * stride0)
// For memref<10x10xf32>, stride1=10, stride0=1
%lb1_scaled = arith.muli %lb1, %c10 : index // lb1 * 10
%lb0_scaled = arith.muli %lb0, %c1 : index // lb0 * 1
%total_offset = arith.addi %lb1_scaled, %lb0_scaled : index // lb1*10 + lb0*1
%neg_offset = arith.subi %c0, %total_offset : index // -(lb1*10 + lb0*1)

// Create a view that adjusts for the lowerbound offset
// This makes accesses like result[lb1][lb0] map to slice_alloc[0][0]
//
// Example for slice a[2:4, 3:5] where:
// - bounds0 (inner): lb0=3, extent0=2
// - bounds1 (outer): lb1=2, extent1=2
// - Allocated memory: 2x2 array (extent1 x extent0 = 2 rows x 2 cols)
// - Linear offset calculation: -(2*10 + 3*1) = -23
// - Result mapping:
// * result[2][3] -> slice_alloc[0][0] (because 2*10+3 + (-23) = 0)
// * result[2][4] -> slice_alloc[0][1] (because 2*10+4 + (-23) = 1)
// * result[3][3] -> slice_alloc[1][0] (because 3*10+3 + (-23) = 10)
// * result[3][4] -> slice_alloc[1][1] (because 3*10+4 + (-23) = 11)
//
// NOTE(review): %slice_alloc has the default (identity) layout with shape
// extent1 x extent0, so its row stride is extent0 (2 in this example), while
// the view below uses a row stride of 10. In a 2x2 buffer, slice_alloc[1][0]
// and slice_alloc[1][1] sit at linear offsets 2 and 3, not 10 and 11, so the
// last two mappings above do not hold for the allocation as written. The
// final memref.cast also targets the identity layout (offset 0) although the
// computed offset is generally non-zero. Presumably this test only exercises
// parsing/printing of the ops - confirm before reusing it as a lowering
// reference.
%adjusted_view = memref.reinterpret_cast %slice_alloc to
offset: [%neg_offset], sizes: [10, 10], strides: [%c10, %c1]
: memref<?x?xf32> to memref<10x10xf32, strided<[?, ?], offset: ?>>

// Cast to the expected return type
%result = memref.cast %adjusted_view : memref<10x10xf32, strided<[?, ?], offset: ?>> to memref<10x10xf32>

acc.yield %result : memref<10x10xf32>
}
Loading