
Commit 5d23561

[Intel] Remove emitIndices (#3533)
We no longer need the Intel-specific `emitIndices`, as the common `emitIndices` now uses linear layout.

Signed-off-by: Whitney Tsang <[email protected]>
1 parent 8375773 commit 5d23561

7 files changed: +17 -134 lines changed
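Every call site keeps its arguments and only drops the `::intel::` / `mlir::triton::intel::` qualification, so name lookup now resolves to the common, linear-layout-based helper. A minimal sketch of the resulting call pattern (the variable names are placeholders for values each conversion pattern already has in scope):

// Common helper shared across backends; indices are derived via LinearLayout.
// Returns an [elemsPerThread x rank] matrix of per-element coordinates.
SmallVector<SmallVector<Value>> indices =
    emitIndices(loc, rewriter, targetInfo, tensorType.getEncoding(),
                tensorType, /*withCTAOffset=*/true);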

third_party/intel/lib/TritonIntelGPUToLLVM/HistogramOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -189,8 +189,8 @@ struct HistogramOpConversion
         LLVM::getSharedMemoryBase(loc, rewriter, targetInfo, op.getOperation());
     auto dstType = op.getType();
     Attribute dstEncoding = dstType.getEncoding();
-    auto indices = ::intel::emitIndices(op.getLoc(), rewriter, targetInfo,
-                                        dstEncoding, dstType, true);
+    auto indices = emitIndices(op.getLoc(), rewriter, targetInfo, dstEncoding,
+                               dstType, true);
     SmallVector<Value> innerDimIndices;
     for (int i = 0; i < indices.size(); ++i)
       innerDimIndices.push_back(indices[i][0]);

third_party/intel/lib/TritonIntelGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -217,8 +217,8 @@ struct LoadStoreConversionBase {
     unsigned numElems = getTotalElemsPerThread(tensorType);
 
     // Get the LLVM values for indices in block
-    auto indices = mlir::triton::intel::emitIndices(
-        loc, rewriter, targetInfo, tensorType.getEncoding(), tensorType, true);
+    auto indices = emitIndices(loc, rewriter, targetInfo,
+                               tensorType.getEncoding(), tensorType, true);
 
     auto linearize =
         [](ArrayRef<Value> A, ArrayRef<Value> B, Value init,

third_party/intel/lib/TritonIntelGPUToLLVM/MakeRangeOpToLLVM.cpp

Lines changed: 1 addition & 2 deletions
@@ -24,8 +24,7 @@ struct MakeRangeOpConversion
     auto elemTy = ty.getElementType();
     assert(elemTy.isInteger(32));
     Value start = createIndexAttrConstant(rewriter, loc, elemTy, op.getStart());
-    auto idxs =
-        ::intel::emitIndices(loc, rewriter, targetInfo, layout, ty, true);
+    auto idxs = emitIndices(loc, rewriter, targetInfo, layout, ty, true);
     unsigned elems = idxs.size();
     SmallVector<Value> retVals(elems);
     // TODO: slice layout has more elements than expected.

third_party/intel/lib/TritonIntelGPUToLLVM/PrintOpToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -65,8 +65,8 @@ struct PrintOpConversion
       SmallVector<SmallVector<Value>> indices;
       if (auto rankedTy =
              dyn_cast<RankedTensorType>(op.getOperand(i).getType())) {
-        indices = ::intel::emitIndices(loc, rewriter, targetInfo,
-                                       rankedTy.getEncoding(), rankedTy, true);
+        indices = emitIndices(loc, rewriter, targetInfo, rankedTy.getEncoding(),
+                              rankedTy, true);
         for (int64_t dim : rankedTy.getShape()) {
           if (dim > 0) {
             dimWidths.push_back(static_cast<int>(std::ceil(std::log10(dim))));

third_party/intel/lib/TritonIntelGPUToLLVM/ReduceOpToLLVM.cpp

Lines changed: 6 additions & 7 deletions
@@ -127,7 +127,7 @@ struct ReduceOpConversion
     RankedTensorType operandType = op.getInputTypes()[0];
     // Assumes offsets don't actually depend on type
     SmallVector<SmallVector<unsigned>> offsets =
-        ::intel::emitOffsetForLayout(helper.getSrcLayout(), operandType);
+        emitOffsetForLayout(helper.getSrcLayout(), operandType);
 
     // Thread X might hold the same input value in two registers. Get the
     // indices in `offsets` that hold unique values, and only accumulate over
@@ -139,9 +139,8 @@ struct ReduceOpConversion
 
     unsigned srcElems = getTotalElemsPerThread(operandType);
     auto *combineOp = &op.getCombineOp();
-    auto srcIndices =
-        ::intel::emitIndices(op.getLoc(), rewriter, targetInfo,
-                             helper.getSrcLayout(), operandType, true);
+    auto srcIndices = emitIndices(op.getLoc(), rewriter, targetInfo,
+                                  helper.getSrcLayout(), operandType, true);
     // reduce within threads
     for (const auto &[_, i] : uniqueOffsets) {
       SmallVector<unsigned> key = offsets[i];
@@ -204,7 +203,7 @@ struct ReduceOpConversion
     auto resultLayout = cast<SliceEncodingAttr>(resultTy.getEncoding());
     unsigned resultElems = getTotalElemsPerThread(resultTy);
     SmallVector<SmallVector<unsigned>> resultOffset =
-        ::intel::emitOffsetForLayout(resultLayout, resultTy);
+        emitOffsetForLayout(resultLayout, resultTy);
     SmallVector<Value> resultVals;
     for (int j = 0; j < resultElems; j++) {
       auto key = resultOffset[j];
@@ -381,8 +380,8 @@ struct ReduceOpConversion
     // nd-tensor where n >= 1
     auto resultLayout = cast<SliceEncodingAttr>(resultTy.getEncoding());
     unsigned resultElems = getTotalElemsPerThread(resultTy);
-    auto resultIndices = ::intel::emitIndices(loc, rewriter, targetInfo,
-                                              resultLayout, resultTy, true);
+    auto resultIndices = emitIndices(loc, rewriter, targetInfo,
+                                     resultLayout, resultTy, true);
     auto resultShape = resultTy.getShape();
     auto resultCTATile = getShapePerCTATile(resultLayout);
     assert(resultIndices.size() == resultElems);

third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h

Lines changed: 0 additions & 111 deletions
@@ -411,117 +411,6 @@ emitBaseIndexForDpasLayout(Location loc, RewriterBase &rewriter,
 
 namespace mlir::triton::intel {
 
-inline SmallVector<SmallVector<unsigned>>
-emitOffsetForLayout(Attribute layout, RankedTensorType type);
-
-// -----------------------------------------------------------------------
-// Get offsets / indices for any layout
-// -----------------------------------------------------------------------
-
-inline SmallVector<Value>
-emitBaseIndexForLayoutImpl(Location loc, RewriterBase &rewriter,
-                           const TargetInfoBase &target, Attribute layout,
-                           RankedTensorType type, bool withCTAOffset) {
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  auto shape = type.getShape();
-
-  SmallVector<Value> baseIndex;
-  RewriterBase::InsertionGuard guard(rewriter);
-  SmallVector<Value> result;
-  if (auto dpasLayout = dyn_cast<DpasEncodingAttr>(layout)) {
-    result = emitBaseIndexForDpasLayout(loc, rewriter, dpasLayout, type);
-  } else if (auto sliceLayout = dyn_cast<SliceEncodingAttr>(layout)) {
-    auto parentLayout = sliceLayout.getParent();
-    auto parentShape = sliceLayout.paddedShape(type.getShape());
-    RankedTensorType parentTy =
-        RankedTensorType::get(parentShape, type.getElementType(), parentLayout);
-    result = ::intel::emitBaseIndexForLayoutImpl(
-        loc, rewriter, target, parentLayout, parentTy, withCTAOffset);
-    result.erase(result.begin() + sliceLayout.getDim());
-    // CTAOffset has been added in emitBaseIndexForLayout of parentLayout
-    return result;
-  } else if (auto dotLayout = dyn_cast<DotOperandEncodingAttr>(layout)) {
-    result = emitBaseIndexForDotOpLayout(loc, rewriter, dotLayout, type);
-  } else {
-    return mlir::emitBaseIndexForLayoutImpl(loc, rewriter, target, layout, type,
-                                            withCTAOffset);
-  }
-  if (withCTAOffset) {
-    auto CTAOffset =
-        emitCTAOffsetForLayout(loc, rewriter, target, layout, shape);
-    assert(CTAOffset.size() == result.size() && "Rank mismatch");
-    for (unsigned k = 0; k < result.size(); ++k) {
-      // Individual elements of `result` may be null. In the caller
-      // (emitBaseIndexForLayout), we assert that all such dimensions are sliced
-      // off.
-      if (!result[k])
-        continue;
-      result[k] = b.add(result[k], CTAOffset[k]);
-    }
-  }
-  return result;
-}
-
-inline SmallVector<Value>
-emitBaseIndexForLayout(Location loc, RewriterBase &rewriter,
-                       const TargetInfoBase &target, Attribute layout,
-                       RankedTensorType type, bool withCTAOffset) {
-  SmallVector<Value> idx = ::intel::emitBaseIndexForLayoutImpl(
-      loc, rewriter, target, layout, type, withCTAOffset);
-
-  // Check that any null values were sliced out.
-  for (Value v : idx) {
-    if (!v) {
-      llvm::errs() << "Failed to generate indexing code, possibly due to bad "
-                      "#mma layout. Please rerun your program with "
-                      "MLIR_ENABLE_DUMP=1 and file a bug."
-                   << "\nloc: " << loc << "\nlayout: " << layout
-                   << "\ntype: " << type << "\nwithCTAOffset: " << withCTAOffset
-                   << "\n";
-      llvm::report_fatal_error("Failed to generate indexing code");
-    }
-  }
-
-  return idx;
-}
-
-inline SmallVector<SmallVector<unsigned>>
-emitOffsetForLayout(Attribute layout, RankedTensorType type) {
-  return mlir::emitOffsetForLayout(layout, type);
-}
-
-// Emit indices calculation within each ConversionPattern, and returns a
-// [elemsPerThread X rank] index matrix.
-inline SmallVector<SmallVector<Value>>
-emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
-            Attribute layout, RankedTensorType type, bool withCTAOffset) {
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  MLIRContext *ctx = rewriter.getContext();
-  auto shape = type.getShape();
-  std::optional<LinearLayout> ll = triton::gpu::toLinearLayout(shape, layout);
-  if (ll.has_value())
-    return mlir::emitIndices(loc, rewriter, target, layout, type,
-                             withCTAOffset);
-
-  // step 1, delinearize threadId to get the base index
-  auto multiDimBase = ::intel::emitBaseIndexForLayout(
-      loc, rewriter, target, layout, type, withCTAOffset);
-  // step 2, get offset of each element
-  auto offset = intel::emitOffsetForLayout(layout, type);
-  // step 3, add offset to base, and reorder the sequence
-  // of indices to guarantee that elems in the same
-  // sizePerThread are adjacent in order
-  unsigned rank = shape.size();
-  unsigned elemsPerThread = offset.size();
-  SmallVector<SmallVector<Value>> multiDimIdx(elemsPerThread,
-                                              SmallVector<Value>(rank));
-  for (unsigned n = 0; n < elemsPerThread; ++n)
-    for (unsigned k = 0; k < rank; ++k)
-      multiDimIdx[n][k] = b.add(multiDimBase[k], b.i32_val(offset[n][k]));
-
-  return multiDimIdx;
-}
-
 Value convertBf16ToFp32(Location loc, ConversionPatternRewriter &rewriter,
                         Value v);
 Value convertFp32ToBf16(Location loc, ConversionPatternRewriter &rewriter,
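The deleted `emitIndices` diverged from the common implementation only when `triton::gpu::toLinearLayout` returned no value for a layout; its first branch already forwarded to `mlir::emitIndices`. Condensed from the removed code above:

// Taken from the removed helper: defer to the common implementation whenever
// the layout converts to a LinearLayout ...
std::optional<LinearLayout> ll = triton::gpu::toLinearLayout(shape, layout);
if (ll.has_value())
  return mlir::emitIndices(loc, rewriter, target, layout, type, withCTAOffset);
// ... and only otherwise fall back to base-plus-offset emission:
//   multiDimIdx[n][k] = multiDimBase[k] + offset[n][k]

The removal implies that every layout these patterns encounter (DPAS, slice, dot-operand, and the rest) now converts to a linear layout, so the fallback path is dead code and the whole helper can go.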

third_party/intel/lib/TritonIntelGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 4 additions & 8 deletions
@@ -253,10 +253,8 @@ struct ExpandDimsOpConversion : public ConvertOpToLLVMPattern<ExpandDimsOp> {
           loc, "ExpandDimsOp only supports SliceEncodingAttr as its input");
     }
     auto resultLayout = resultTy.getEncoding();
-    auto srcOffsets =
-        mlir::triton::intel::emitOffsetForLayout(srcLayout, srcTy);
-    auto resultOffsets =
-        mlir::triton::intel::emitOffsetForLayout(resultLayout, resultTy);
+    auto srcOffsets = emitOffsetForLayout(srcLayout, srcTy);
+    auto resultOffsets = emitOffsetForLayout(resultLayout, resultTy);
     std::map<SmallVector<unsigned>, Value> srcValues;
     for (size_t i = 0; i < srcOffsets.size(); i++) {
       srcValues[srcOffsets[i]] = srcVals[i];
@@ -338,10 +336,8 @@ struct BroadcastOpConversion
     auto typeConverter = getTypeConverter();
     assert(rank == resultTy.getRank());
     auto order = triton::gpu::getOrder(srcLayout);
-    auto srcOffsets =
-        mlir::triton::intel::emitOffsetForLayout(srcLayout, srcTy);
-    auto resultOffsets =
-        mlir::triton::intel::emitOffsetForLayout(resultLayout, resultTy);
+    auto srcOffsets = emitOffsetForLayout(srcLayout, srcTy);
+    auto resultOffsets = emitOffsetForLayout(resultLayout, resultTy);
     SmallVector<Value> srcVals = unpackLLElements(loc, src, rewriter);
     std::map<SmallVector<unsigned>, Value> srcValues;
     for (size_t i = 0; i < srcOffsets.size(); i++) {
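The `emitOffsetForLayout` call sites here and in ReduceOpToLLVM.cpp are an even simpler case: as the Utility.h hunk above shows, the removed Intel wrapper forwarded directly to the common function, so dropping the qualification cannot change behavior. The removed wrapper in its entirety:

inline SmallVector<SmallVector<unsigned>>
emitOffsetForLayout(Attribute layout, RankedTensorType type) {
  return mlir::emitOffsetForLayout(layout, type);
}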
