@@ -669,14 +669,14 @@ def TT_DotOp : TT_Op<"dot", [Pure,
669669// DotScaled Op
670670//
671671def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
672- AttrSizedOperandSegments,
673- DotLike,
674- TypesMatchWith<"result's type matches accumulator's type",
675- "d", "c", "$_self">]> {
672+ AttrSizedOperandSegments,
673+ DotLike,
674+ TypesMatchWith<"result's type matches accumulator's type",
675+ "d", "c", "$_self">]> {
676676 let summary = "dot_scaled";
677677
678678 let description = [{
679- $d = matrix_multiply(scale($lhs, $lhs_scale), scale($rhs , $rhs_scale)) + $c.
679+ $d = matrix_multiply(scale($lhs, $lhs_scale), scale($rhs, $rhs_scale)) + $c.
680680 Where scale(x, s) is a function that applies the scale per block following microscaling spec.
681681 }];
682682
@@ -687,16 +687,15 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
687687 RankedTensorOf<[TT_Float,I8]>:$lhs,
688688 RankedTensorOf<[TT_Float,I8]>:$rhs,
689689 TT_FloatTensor:$c,
690- Optional<RankedTensorOf<[I8]>>:$lhs_scale,
691- Optional<RankedTensorOf<[I8]>>:$rhs_scale,
690+ Optional<RankedTensorOf<[TT_Float, I8]>>:$lhs_scale,
691+ Optional<RankedTensorOf<[TT_Float, I8]>>:$rhs_scale,
692692 TT_ScaleDotElemTypeAttr:$lhs_type,
693693 TT_ScaleDotElemTypeAttr:$rhs_type,
694694 BoolAttr:$fastMath
695695 );
696696
697697 let results = (outs TT_FloatTensor:$d);
698698
699- // Not sure why I need to fully specify the optional group, but otherwise it complains when loading the mlir file
700699 let assemblyFormat = [{
701700 $lhs (`scale` $lhs_scale^)? `,` $rhs (`scale` $rhs_scale^)? `,` $c `lhs` `=` $lhs_type `rhs` `=` $rhs_type attr-dict
702701 `:` type($lhs) (`,` type($lhs_scale)^)? `*` type($rhs) (`,` type($rhs_scale)^)? `->` type($d)
@@ -1297,6 +1296,57 @@ def TT_ExperimentalDescriptorStoreOp : TT_Op<"experimental_descriptor_store", [
12971296 let hasVerifier = 1;
12981297}
12991298
1299+ def TT_ExperimentalDescriptorGatherOp : TT_Op<"experimental_descriptor_gather", [MemoryEffects<[MemRead<GlobalMemory>]>]> {
1300+ let summary = "gather multiple rows from a descriptor into a single tensor";
1301+ let description = [{
1302+ The `tt.experimental_descriptor_gather` op will be lowered to NVIDIA TMA
1303+ load operations on targets that support it.
1304+
1305+ `desc` refers to the TMA descriptor allocated in global memory.
1306+ The descriptor block must have 1 row and the indices (`x_offsets`) must be a 1D tensor.
1307+ Accordingly, the result is a 2D tensor with multiple rows.
1308+
1309+ This is an escape hatch and is only there for testing/experimenting. This
1310+ op will be removed in the future.
1311+ }];
1312+
1313+ let arguments = (ins
1314+ TT_TensorDescType:$desc,
1315+ RankedTensorOf<[I32]>:$x_offsets,
1316+ I32:$y_offset
1317+ );
1318+ let results = (outs TT_Tensor:$result);
1319+
1320+ let assemblyFormat = [{
1321+ $desc `[` $x_offsets `,` $y_offset `]`
1322+ attr-dict `:` functional-type(operands, results)
1323+ }];
1324+
1325+ let hasVerifier = 1;
1326+
1327+ let extraClassDeclaration = [{
1328+ // TMA gathers have restrictions on the minimum size of the gather result.
1329+ // This function verifies the result type.
1330+ static LogicalResult verifyResultType(Operation *op, mlir::ShapedType type);
1331+ }];
1332+ }
1333+
1334+ def TT_ExperimentalDescriptorScatterOp : TT_Op<"experimental_descriptor_scatter", [ // Op with mnemonic "experimental_descriptor_scatter"; declares no results. NOTE(review): no summary/description/verifier is set here, unlike the gather op above -- confirm this is intended.
1335+ MemoryEffects<[MemRead<GlobalMemory>, MemWrite<GlobalMemory>]>, // declares both a read and a write effect on global memory
1336+ ]> {
1337+ let arguments = (ins
1338+ TT_TensorDescType:$desc, // tensor descriptor operand
1339+ RankedTensorOf<[I32]>:$x_offsets, // ranked tensor with i32 elements
1340+ I32:$y_offset, // single i32 value
1341+ TT_Tensor:$src // tensor operand; presumably the data being scattered -- TODO confirm
1342+ );
1343+
1344+ let assemblyFormat = [{
1345+ $desc `[` $x_offsets `,` $y_offset `]` `,` $src
1346+ attr-dict `:` type(operands)
1347+ }];
1348+ }
1349+
13001350def TT_ExperimentalTensormapCreateOp: TT_Op<
13011351 "experimental_tensormap_create",
13021352 [
0 commit comments