lutzroeder
diff --git a/‎source/mlir-metadata.json‎
Lines changed: 25 additions & 10 deletions b/‎source/mlir-metadata.json‎
Lines changed: 25 additions & 10 deletions
@@ -481,8 +481,8 @@
   },
   {
     "name": "acc.firstprivate_map",
-    "summary": "Used to decompose firstprivate semantics and represents the mapping of the initial value.",
-    "description": "Description of arguments:\n    - `var`: The variable to copy. Must be either `MappableType` or\n    `PointerLikeType`.\n    - `varType`: The type of the variable that is being copied. When `var` is\n    a `MappableType`, this matches the type of `var`. When `var` is a\n    `PointerLikeType`, this type holds information about the target of the\n    pointer.\n    - `varPtrPtr`: Specifies the address of the address of `var` - only used\n    when the variable copied is a field in a struct. This is important for\n    OpenACC due to implicit attach semantics on data clauses (2.6.4).\n    - `bounds`: Used when copying just slice of array or array's bounds are not\n    encoded in type. They are in rank order where rank 0 is inner-most dimension.\n    - `asyncOperands` and `asyncOperandsDeviceType`:\n    pair-wise lists of the async clause values associated with device_type's.\n    - `asyncOnly`: a list of device_type's for which async clause\n    does not specify a value (default is acc_async_noval - OpenACC 3.3 2.16.1).\n    - `dataClause`: Keeps track of the data clause the user used. This is because\n    the acc operations are decomposed. So a 'copy' clause is decomposed to both \n    `acc.copyin` and `acc.copyout` operations, but both have dataClause that\n    specifies `acc_copy` in this field.\n    - `structured`: Flag to note whether this is associated with structured region\n    (parallel, kernels, data) or unstructured (enter data, exit data). 
This is\n    important due to spec specifically calling out structured and dynamic reference\n    counters (2.6.7).\n    - `implicit`: Whether this is an implicitly generated operation, such as copies\n    done to satisfy \"Variables with Implicitly Determined Data Attributes\" in 2.6.2.\n    - `modifiers`: Keeps track of the data clause modifiers (eg zero, readonly, etc)\n    - `name`: Holds the name of variable as specified in user clause (including bounds).\n\n    The async values attached to the data entry operation imply that the data\n    action applies to all device types specified by the device_type clauses\n    using the activity queues on these devices as defined by the async values.",
+    "summary": "Represents the mapping of the initial value for firstprivate semantics.",
+    "description": "The `acc.firstprivate_map` operation is an intermediate representation\n    used during the decomposition of `acc.firstprivate` operations. It\n    represents the mapping of the initial value from the host to the device,\n    which is then used to initialize per-thread private copies.\n\n    This operation is distinct from `acc.copyin` because:\n    - `acc.copyin` includes present counter updates, but private variables\n      do not impact reference counters\n    - The mapped value is used to initialize private copies rather than\n      being accessed directly",
     "operands": [
       { "name": "var", "type": "OpenACC_AnyPointerOrMappableType" },
       { "name": "varPtrPtr", "type": "Optional<OpenACC_PointerLikeTypeInterface>" },
@@ -1521,6 +1521,19 @@
     ],
     "assemblyFormat": "$desc `[` $indices `]` `from` $src (`,` `barrier` `=` $barrier^)?\n    attr-dict `:` qualified(type($src)) (`,` qualified(type($barrier))^)? `->` qualified(type($desc))"
   },
+  {
+    "name": "amdg.async_tdm_scatter",
+    "summary": "Scatter data from local memory to non-contiguous global memory rows asynchronously",
+    "description": "This operation scatters data from local memory to non-contiguous rows in global\n    memory using TDM scatter mode.\n    Unlike the regular async_tdm_copy_local_to_global which copies to contiguous memory,\n    this operation uses dst_row_indices to specify which rows in global memory to write to.\n\n    The descriptor must be 2D. The dst_row_indices specify which rows in global memory\n    to write to. The element type of dst_row_indices determines the index size:\n    - I16: 16-bit indices, up to 16 rows per instruction\n    - I32: 32-bit indices, up to 8 rows per instruction\n    If more rows are needed, multiple TDM instructions will be issued.\n\n    The dst_col_offset specifies the starting column in the destination tensor for\n    all scattered rows.",
+    "operands": [
+      { "name": "desc", "type": "TT_TensorDescType" },
+      { "name": "dst_row_indices", "type": "TensorOf<[I16, I32]>" },
+      { "name": "dst_col_offset", "type": "I32" },
+      { "name": "src", "type": "TTG_MemDescType" },
+      { "name": "barrier", "type": "Optional<TTG_MemDescType>" }
+    ],
+    "assemblyFormat": "$desc `[` $dst_row_indices `,` $dst_col_offset `]` `from` $src (`,` `barrier` `=` $barrier^)?\n    attr-dict `:` qualified(type($dst_row_indices)) `,` qualified(type($src)) (`,` qualified(type($barrier))^)? `->` qualified(type($desc))"
+  },
   {
     "name": "amdg.async_tdm_wait",
     "summary": "Wait until there are less than or equal to the given number of outstanding TDM operations",
@@ -1980,7 +1993,7 @@
   {
     "name": "amdgpu.lds_barrier",
     "summary": "Barrier that includes a wait for LDS memory operations.",
-    "description": "`amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must reach\n    the barrier before any of them may proceed past it) and a wait for all\n    operations that affect the Local Data Store (LDS) issued from that wrokgroup\n    to complete before the workgroup may continue. Since the LDS is per-workgroup\n    memory, this barrier may be used, for example, to ensure all workitems have\n    written data to LDS before any workitem attempts to read from it.\n\n    Note that `lds_barrier` does **not** force reads to or from global memory\n    to complete before execution continues. Therefore, it should be used when\n    operations on global memory can be issued far in advance of when their results\n    are used (for example, by writing them to LDS).\n\n    WARNING: On architectures that do not support the BackOffBarrier feature,\n    (those which will implement this barrier by emitting inline assembly),\n    use of this operation will impede the usabiliity of memory watches (including\n    breakpoints set on variables) when debugging.",
+    "description": "**DEPRECATION NOTICE**: Unless you need the inline-assembly-based workaround\n    for gfx908/MI-100, you should represent this pattern with the equivalent\n\n    ```mlir\n    gpu.barrier memfence [#gpu.address_space<workgroup>]\n    ```\n\n    instead.\n\n    `amdgpu.lds_barrier` is both a barrier (all workitems in a workgroup must reach\n    the barrier before any of them may proceed past it) and a wait for all\n    operations that affect the Local Data Store (LDS) issued from that workgroup\n    to complete before the workgroup may continue. Since the LDS is per-workgroup\n    memory, this barrier may be used, for example, to ensure all workitems have\n    written data to LDS before any workitem attempts to read from it.\n\n    Note that `lds_barrier` does **not** force reads to or from global memory\n    to complete before execution continues. Therefore, it should be used when\n    operations on global memory can be issued far in advance of when their results\n    are used (for example, by writing them to LDS).\n\n    WARNING: On architectures that do not support the BackOffBarrier feature,\n    (those which will implement this barrier by emitting inline assembly),\n    use of this operation will impede the usability of memory watches (including\n    breakpoints set on variables) when debugging.",
     "assemblyFormat": "attr-dict"
   },
   {
@@ -14245,8 +14258,11 @@
   {
     "name": "gpu.barrier",
     "summary": "Synchronizes all work items of a workgroup.",
-    "description": "The `barrier` op synchronizes all work items of a workgroup. It is used\n    to coordinate communication between the work items of the workgroup.\n\n    ```mlir\n    gpu.barrier\n    ```\n\n    waits until all work items in the workgroup have reached this point\n    and all memory accesses made by these work items prior to the op are\n    visible to all work items in the workgroup. Data hazards between work items\n    accessing the same memory can be avoided by synchronizing work items\n    in-between these accesses.\n\n    Either none or all work items of a workgroup need to execute this op\n    in convergence.",
-    "assemblyFormat": "attr-dict"
+    "description": "The `barrier` op synchronizes all work items of a workgroup. It is used\n    to coordinate communication between the work items of the workgroup.\n\n    ```mlir\n    gpu.barrier\n    ```\n\n    waits until all work items in the workgroup have reached the operation\n    and all memory accesses made by these work items prior to the op are\n    visible to all work items in the workgroup. Data hazards between work items\n    accessing the same memory can be avoided by synchronizing work items\n    in-between these accesses.\n\n    If the `memfence` attribute is specified, the set of memory accesses that must\n    be completed after the barrier resolves is limited to only those accesses that\n    read from or write to the specified address spaces (though accesses to other\n    address spaces may be completed as well, especially if a particular combination\n    of address spaces is not supported on a given backend). In particular,\n    specifying `memfence []` creates a barrier that is not required to affect\n    the visibility of any memory operations and is purely used for synchronizing\n    work items.\n\n    ```mlir\n    // Only workgroup address space accesses required to be visible.\n    gpu.barrier memfence [#gpu.address_space<workgroup>]\n    // No memory accesses required to be visible.\n    gpu.barrier memfence []\n    // All memory accesses required to be visible.\n    gpu.barrier\n    ```\n\n    Either none or all work items of a workgroup need to execute this op\n    in convergence.",
+    "attributes": [
+      { "name": "address_spaces", "type": "OptionalAttr<TypedArrayAttrBase<GPU_AddressSpaceAttr{global|workgroup|private}>>" }
+    ],
+    "assemblyFormat": "(`memfence` $address_spaces^)? attr-dict"
   },
   {
     "name": "gpu.binary",
@@ -111816,9 +111832,9 @@
   {
     "name": "ttng.async_tma_copy_global_to_local",
     "summary": "copy data based on descriptor from global memory to local memory asynchronously",
-    "description": "This operation copies data from global memory to local memory\n    asynchronously.  This is analogue to tt.load except the data are copied to\n    local memory pointed by the memory descriptor instead of a distributed\n    tensor. The data copied depends on the global memory descriptor pointed to\n    by `desc`.\n\n    The operation supports two tensor modes:\n    - TILED (default): Regular tiled tensor memory access\n      - See: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-mode\n    - IM2COL: Im2col mode for convolution-friendly access patterns\n      - In IM2COL mode, 'coord' is the coordinates in the input tensor\n        - For example, for a 4D tensor (NHWC), 'coord' is [batch_idx, channel_idx, h, w]\n      - In IM2COL mode, additional `offsets` must be provided (uint16 values)\n        - For 3D tensors (NWC): 1 offset (offset_w)\n        - For 4D tensors (NHWC): 2 offsets (offset_w, offset_h)\n        - For 5D tensors (NDHWC): 3 offsets (offset_w, offset_h, offset_d)\n        - General rule: number of offsets = coord.size() - 2\n      - See: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-mode",
+    "description": "This operation copies data from global memory to local memory\n    asynchronously.  This is analogous to tt.load except the data are copied to\n    local memory pointed to by the memory descriptor instead of a distributed\n    tensor. The data copied depends on the global memory descriptor pointed to\n    by `desc`.\n\n    The tensor mode is determined by the descriptor type:\n    - tt.tensordesc: TILED mode - Regular tiled tensor memory access\n      - See: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-tiled-mode\n    - ttng.tensordesc_im2col: IM2COL mode - Im2col mode for convolution-friendly access patterns\n      - In IM2COL mode, 'coord' is the coordinates in the input tensor\n        - For example, for a 4D tensor (NHWC), 'coord' is [batch_idx, channel_idx, h, w]\n      - In IM2COL mode, additional `offsets` must be provided (uint16 values)\n        - For 3D tensors (NWC): 1 offset (offset_w)\n        - For 4D tensors (NHWC): 2 offsets (offset_w, offset_h)\n        - For 5D tensors (NDHWC): 3 offsets (offset_w, offset_h, offset_d)\n        - General rule: number of offsets = coord.size() - 2\n      - See: https://docs.nvidia.com/cuda/parallel-thread-execution/#tensor-im2col-mode",
     "operands": [
-      { "name": "desc", "type": "TT_TensorDescType" },
+      { "name": "desc", "type": "TT_AnyTensorDescType" },
       { "name": "coord", "type": "Variadic<I32>" },
       { "name": "offsets", "type": "Variadic<I16>" },
       { "name": "barrier", "type": "TTG_MemDescType" },
@@ -111829,13 +111845,12 @@
       { "name": "multicast", "type": "UnitAttr" },
       { "name": "cache", "type": "DefaultValuedAttr<TT_CacheModifierAttr{none|ca|cg|wb|cs|wt|cv}, triton::CacheModifier::NONE>" },
       { "name": "evict", "type": "DefaultValuedAttr<TT_EvictionPolicyAttr{evict_normal|evict_first|evict_last}, triton::EvictionPolicy::NORMAL>" },
-      { "name": "isVolatile", "type": "DefaultValuedAttr<BoolAttr, false>" },
-      { "name": "tensorMode", "type": "DefaultValuedAttr<TTNG_TensorModeAttr{tiled|im2col}, triton::nvidia_gpu::TensorMode::TILED>" }
+      { "name": "isVolatile", "type": "DefaultValuedAttr<BoolAttr, false>" }
     ],
     "traits": [
       { "type": "AttrSizedOperandSegments" }
     ],
-    "assemblyFormat": "$desc `[` $coord `]` (`offsets` `=` `[` $offsets^ `]`)? $result `,` $barrier `,` $pred\n    oilist(`cacheModifier` `=` $cache | `evictionPolicy` `=` $evict | `tensorMode` `=` $tensorMode)\n    attr-dict `:` qualified(type($desc)) `,` qualified(type($barrier)) `->` qualified(type($result))"
+    "assemblyFormat": "$desc `[` $coord `]` (`offsets` `=` `[` $offsets^ `]`)? $result `,` $barrier `,` $pred\n    oilist(`cacheModifier` `=` $cache | `evictionPolicy` `=` $evict)\n    attr-dict `:` qualified(type($desc)) `,` qualified(type($barrier)) `->` qualified(type($result))"
   },
   {
     "name": "ttng.async_tma_copy_local_to_global",