"summary": "Swap the Value of the invocation within the quad with another invocation\n in the quad using Direction.",
+ "description": "Result Type must be a scalar or vector of floating-point type, integer type,\n or Boolean type.\n\n Execution is a Scope, but has no effect on the behavior of this instruction.\n It must be Subgroup.\n\n The type of Value must be the same as Result Type.\n\n Direction is the kind of swap to perform.\n\n Direction must be a scalar of integer type, whose Signedness operand is 0.\n\n Direction must come from a constant instruction.\n\n The value returned in Result is the value provided to Value by another invocation\n in the same quad scope instance. The invocation providing this value is\n determined according to Direction.\n\n A Direction of 0 indicates a horizontal swap;\n - Invocations with quad indices of 0 and 1 swap values\n - Invocations with quad indices of 2 and 3 swap values\n A Direction of 1 indicates a vertical swap;\n - Invocations with quad indices of 0 and 2 swap values\n - Invocations with quad indices of 1 and 3 swap values\n A Direction of 2 indicates a diagonal swap;\n - Invocations with quad indices of 0 and 3 swap values\n - Invocations with quad indices of 1 and 2 swap values\n\n Direction must be one of the above values.\n\n If a tangled invocation within the quad reads Value from an invocation not part\n of the tangled invocation within the same quad, the resulting value is undefined.\n\n An invocation will not execute a dynamic instance of this instruction (X') until\n all invocations in its quad have executed all dynamic instances that are program-ordered\n before X'.\n\n #### Example:\n\n ```mlir\n %0 = spirv.GroupNonUniformQuadSwap <Subgroup> %value %dir : f32, i32\n %1 = spirv.GroupNonUniformQuadSwap <Subgroup> %value %dir : vector<4xf32>, i32\n ```",
"summary": "Rotate values across invocations within a subgroup.",
@@ -107994,13 +108015,17 @@
{
"name": "ttg.global_scratch_alloc",
"summary": "allocate a global memory buffer",
- "description": "This operation allocates a buffer in global memory that is private to the current program.",
+ "description": "This operation allocates a buffer in global memory that is private to the current program.\n The `backend` attribute specifies the backend to use for allocation.\n The `default` backend is used by TritonGPU passes.\n Downstream Triton tools and compilers can register a different backend and use a different allocation policy.",
"summary": "Gather elements from shared memory along a specified axis",
+ "description": "Gather elements from a shared memory descriptor using an indices tensor along a\n single specified axis. The output tensor has the same shape as the indices tensor.\n\n For each output position I, the operation reads from src where the coordinate at\n the gather axis is replaced by indices[I]:\n result[I] = src[I[0], ..., indices[I], ..., I[n]]\n where the axis dimension is replaced by the index value.\n\n This matches the behavior of tt.gather but operates on shared memory descriptors.",
"summary": "Scatter elements to shared memory along a specified axis",
+ "description": "Scatter elements to a shared memory descriptor using an indices tensor along a\n single specified axis. The values tensor has the same shape as the indices tensor.\n\n For each input position I, the operation writes to dst where the coordinate at\n the scatter axis is replaced by indices[I]:\n dst[I[0], ..., indices[I], ..., I[n]] = values[I]\n where the axis dimension is replaced by the index value.\n\n This is the inverse of local_gather and writes to shared memory at runtime-computed indices.",
"summary": "Store a distributed tensor into a buffer in local memory",
@@ -108187,9 +108244,6 @@
"name": "ttg.warp_specialize",
"summary": "asynchronously execute code on multiple warpgroups",
"description": "The `ttg.warp_specialize` op represents executing different code\n simultaneously on different warp groups. A warp group is a group of\n power-of-2 warps, which can be a different number of warps than in the\n enclosing region.\n\n The \"default\" region of the op represents the code executed by the currently\n executing warp group. This region is allowed to implicitly capture. The op\n contains a number of \"partition\" regions that are isolated from above. They\n must be isolated because these regions represent different layout domains,\n as the number of warps is different.\n\n Semantically, execution of each region starts simultaneously for each warp\n group, and all warp groups are joined at the end of the op.\n\n Example:\n\n ```mlir\n %0 = ttg.warp_specialize(%a, %b)\n default {\n %out = some_operation(%a) // implicit capture of `%a`\n ttg.warp_yield %out : i32\n }\n partition0(%arg0: i32, %arg1: i32) num_warps(8) {\n some_async_dispatch(%arg0, %arg1)\n ttg.warp_return\n }\n partition1(%arg0: i32, %arg1: i32) num_warps(1) {\n some_async_dispatch(%arg0, %arg1)\n ttg.warp_return\n } : (i32, i32) -> i32\n ```",
"summary": "container op for `ttg.warp_specialize`",
"description": "Because MLIR requires entire operations be isolated from above, this op\n contains the actual isolated from above regions of `ttg.warp_specialize`.",