intel
diff --git a/include/triton/Analysis/Utility.h b/include/triton/Analysis/Utility.h
Lines changed: 19 additions & 12 deletions
diff --git a/include/triton/Conversion/TritonGPUToLLVM/Utility.h b/include/triton/Conversion/TritonGPUToLLVM/Utility.h
Lines changed: 3 additions & 0 deletions
diff --git a/include/triton/Tools/LayoutUtils.h b/include/triton/Tools/LayoutUtils.h
Lines changed: 16 additions & 0 deletions
@@ -172,22 +172,29 @@ class GatherLoweringHelper {
   RankedTensorType dstTy;
 };
 
-// This struct represents a decomposed layout conversion within a warp into
-// three transformations: P1 and P2 represent lane-dependent register shuffles
-// and W represents a warp shuffle. P2^-1 is returned because it represents the
-// (reg, lane) -> (reg) mapping from the perspective of the destination element.
+// This struct represents the factorization of a warp-local layout conversion
+// into three components: a register-only permutation, a lane-only permutation,
+// and a set of swaps between lane and register basis vectors. Algebraically, it
+// represents the factorization P = P_mixed \circ P_lane \circ P_reg. It is used
+// to aid in the implementation of the layout conversion using warp-shuffles.
 //
-// Nearly all layout conversions that only require data movement within a warp
-// can be implemented this way.
+// `pReg` and `pLane` are square layouts, each with exactly one input dimension
+// and one output dimension. `mixedTranspositions` holds pairs of integers
+// (i, j), each corresponding to the transposition (r_i l_j) of the i-th
+// register basis vector with the j-th lane basis vector.
 struct DecomposedWarpConversion {
-  triton::LinearLayout P1, W, P2inv;
-  triton::LinearLayout reducedP1, reducedP2inv;
+  triton::LinearLayout pReg, pLane;
+  SmallVector<std::pair<int, int>> mixedTranspositions;
 };
 
-// Given the source and destination tensor types where a layout conversion only
-// involves data movement within warps, attempt to find a decomposition for a
-// warp layout conversion.
-std::optional<DecomposedWarpConversion>
+// Produces a decomposition of a permutation describing a warp-local layout
+// conversion as described in `DecomposedWarpConversion` above.
+//
+// This function handles cases where the numbers of register and lane basis
+// vectors differ between the two layouts. This is done by padding the smaller
+// dimension(s) with zero vectors, ensuring that the layout conversion can be
+// represented as a permutation.
+DecomposedWarpConversion
 getWarpLayoutConvertDecomposition(RankedTensorType srcTy,
                                   RankedTensorType dstTy);
 
 
@@ -327,6 +327,9 @@ LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
                                          StringRef funcName, Type funcType,
                                          StringRef libname = "",
                                          StringRef libpath = "");
+
+// Multiply a square layout with one input and one output dimension by a vector.
+Value matrixVectorProd(TritonLLVMOpBuilder &b, const LinearLayout &A, Value x);
 } // namespace gpu
 
 } // namespace triton
 
@@ -148,6 +148,22 @@ LinearLayout reshapeLayout(MLIRContext *ctx, LinearLayout layout,
 // order.
 LinearLayout transposeLinearLayout(LinearLayout layout, ArrayRef<int> order);
 
+// Reorders the in and out dimensions to match another layout.
+LinearLayout reorder_like(const LinearLayout &x, const LinearLayout &y);
+
+// For two layouts, `src` and `dst`, that differ only by a permutation of
+// their basis vectors, return a permutation layout `P` which satisfies
+// `dst` \circ `P` = `src`.
+//
+// The returned layout has the following properties:
+// - The orders of the input and output dimensions of `P` match the order of the
+//   input dimensions of `src`.
+// - Prioritizes making zero (broadcasting) vectors fixed-points of the
+//   permutation. I.e., if a vector is zero in both `src` and `dst` for the same
+//   input coordinate, it maps to itself under `P`.
+LinearLayout basisPermutationLayout(const LinearLayout &src,
+                                    const LinearLayout &dst);
+
 } // namespace mlir::triton
 
 #endif // TRITON_TOOLS_LAYOUTUTILS_H