
Commit 0ef2a3f

Merge branch 'main' into issue2662
2 parents 9f74d61 + 9952acf commit 0ef2a3f

File tree

11 files changed, +268 -78 lines changed
Lines changed: 65 additions & 0 deletions
@@ -0,0 +1,65 @@
// RUN: triton-opt %s -split-input-file --intel-allocate-shared-memory | FileCheck %s

#blocked = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>

// Check no scratch memory is allocated for sub-group shuffle-like layout conversions.

// CHECK-LABEL: module attributes
// CHECK-SAME: triton_gpu.shared = 0 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  // CHECK: tt.func @test_sub_group_shuffle
  // CHECK-NOT: llvm.ptr<3>
  tt.func @test_sub_group_shuffle(%arg0: tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>>) -> tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked}>> -> tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
    tt.return %0 : tensor<16xf16, #triton_gpu.slice<{dim = 1, parent = #blocked1}>>
  }
}

// -----

#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>

// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
// CHECK-SAME: triton_gpu.shared = 512 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  tt.func @test_f16(%arg0: tensor<16x16xf16, #blocked>) -> tensor<16x16xf16, #blocked1> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf16, #blocked> -> tensor<16x16xf16, #blocked1>
    tt.return %0 : tensor<16x16xf16, #blocked1>
  }
}

// -----

#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [1, 1], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [1, 1], order = [0, 1]}>

// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
// CHECK-SAME: triton_gpu.shared = 1024 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  tt.func @test_f32(%arg0: tensor<16x16xf32, #blocked>) -> tensor<16x16xf32, #blocked1> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<16x16xf32, #blocked> -> tensor<16x16xf32, #blocked1>
    tt.return %0 : tensor<16x16xf32, #blocked1>
  }
}

// -----

#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [4, 2], order = [0, 1]}>
#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [4, 2], order = [0, 1]}>

// Check scratch memory configuration for different sub-group transpose-like layout conversions.

// CHECK-LABEL: module attributes
// CHECK-SAME: triton_gpu.shared = 32768 : i32
module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 8 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
  tt.func @test_f32(%arg0: tensor<128x64xf32, #blocked>) -> tensor<128x64xf32, #blocked1> {
    %0 = triton_gpu.convert_layout %arg0 : tensor<128x64xf32, #blocked> -> tensor<128x64xf32, #blocked1>
    tt.return %0 : tensor<128x64xf32, #blocked1>
  }
}
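
For reference, the triton_gpu.shared values checked above (512, 1024 and 32768 bytes) can be reproduced from the sub-group transpose scratch shape this commit adds to third_party/intel/lib/Analysis/Allocation.cpp further down. A small self-contained sketch, assuming the allocated size is simply the product of repShape times the element size with no extra padding (which matches these CHECK lines):

// Standalone sketch, not part of the commit: recomputes the expected
// shared-memory sizes from repShape = {threadsPerWarp, threadsPerWarp,
// remaining, warpsPerCTA}.
#include <cstdio>

static unsigned scratchBytes(unsigned rows, unsigned cols,
                             unsigned threadsPerWarp, unsigned warpsPerCTA,
                             unsigned elemBytes) {
  unsigned remaining =
      (rows * cols) / (threadsPerWarp * threadsPerWarp * warpsPerCTA);
  unsigned elems = threadsPerWarp * threadsPerWarp * remaining * warpsPerCTA;
  return elems * elemBytes;
}

int main() {
  std::printf("%u\n", scratchBytes(16, 16, 16, /*warpsPerCTA=*/1, /*f16*/ 2));  // 512
  std::printf("%u\n", scratchBytes(16, 16, 16, /*warpsPerCTA=*/1, /*f32*/ 4));  // 1024
  std::printf("%u\n", scratchBytes(128, 64, 16, /*warpsPerCTA=*/8, /*f32*/ 4)); // 32768
  return 0;
}

The sub-group shuffle case above needs no scratch memory at all, hence the triton_gpu.shared = 0 check.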

test/Conversion/intel/sub-group-shuffle.mlir

Lines changed: 6 additions & 6 deletions
@@ -9,7 +9,7 @@
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @test_f16(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f16)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f16)>)
   // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f16)>
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: llvm.call spir_funccc @_Z17sub_group_shuffleDhj(%[[VAL_2]], %[[VAL_4]])
@@ -49,7 +49,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
   }
 
   // CHECK-LABEL: llvm.func spir_kernelcc @test_bf16(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(bf16)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(bf16)>)
   // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(bf16)>
   // CHECK: %[[VAL_2:.*]] = llvm.bitcast %[[VAL_1]] : bf16 to i16
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -91,7 +91,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
   }
 
   // CHECK-LABEL: llvm.func spir_kernelcc @test_i1(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(i1)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(i1)>)
   // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(i1)>
   // CHECK: %[[VAL_2:.*]] = llvm.zext %[[VAL_1]] : i1 to i8
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -133,7 +133,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
   }
 
   // CHECK-LABEL: llvm.func spir_kernelcc @test_ptr(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(ptr<1>)>)
   // CHECK: %[[VAL_1:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(ptr<1>)>
   // CHECK: %[[VAL_2:.*]] = llvm.ptrtoint %[[VAL_1]] : !llvm.ptr<1> to i64
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
@@ -186,7 +186,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
 
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 32 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @test_f32(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f32)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f32)>)
   // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f32)>
   // CHECK: %[[VAL_4:.*]] = llvm.mlir.constant(0 : i32) : i32
   // CHECK: llvm.call spir_funccc @_Z17sub_group_shufflefj(%[[VAL_2]], %[[VAL_4]])
@@ -269,7 +269,7 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :
 
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
   // CHECK-LABEL: llvm.func spir_kernelcc @test_non_sliced_multi_register(
-  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f64, f64)>,
+  // CHECK-SAME: %[[VAL_0:.*]]: !llvm.struct<(f64, f64)>)
   // CHECK: %[[VAL_2:.*]] = llvm.extractvalue %[[VAL_0]][0] : !llvm.struct<(f64, f64)>
   // CHECK: %[[VAL_3:.*]] = llvm.extractvalue %[[VAL_0]][1] : !llvm.struct<(f64, f64)>
   // CHECK: %[[VAL_5:.*]] = llvm.mlir.constant(0 : i32) : i32

test/Conversion/intel/sub-group-transpose.mlir

Lines changed: 17 additions & 0 deletions
@@ -426,3 +426,20 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 :
     tt.return %0 : tensor<32x64xf32, #blocked1>
   }
 }
+
+// -----
+
+// Test that no barriers are inserted when back-to-back transpositions are performed.
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [16, 1], threadsPerWarp = [1, 16], warpsPerCTA = [2, 2], order = [0, 1]}>
+#blocked1 = #triton_gpu.blocked<{sizePerThread = [1, 16], threadsPerWarp = [16, 1], warpsPerCTA = [2, 2], order = [0, 1]}>
+
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 4 : i32, "triton_gpu.threads-per-warp" = 16 : i32} {
+  // CHECK-LABEL: llvm.func spir_kernelcc @test_back_to_back
+  // CHECK-NOT: barrier
+  tt.func @test_back_to_back(%arg0: tensor<32x64xf32, #blocked>, %arg1: tensor<32x64xf32, #blocked>) -> (tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>) {
+    %0 = triton_gpu.convert_layout %arg0 : tensor<32x64xf32, #blocked> -> tensor<32x64xf32, #blocked1>
+    %1 = triton_gpu.convert_layout %arg1 : tensor<32x64xf32, #blocked> -> tensor<32x64xf32, #blocked1>
+    tt.return %0, %1 : tensor<32x64xf32, #blocked1>, tensor<32x64xf32, #blocked1>
+  }
+}
Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
#ifndef TRITON_INTEL_ANALYSIS_MEMBAR_H
#define TRITON_INTEL_ANALYSIS_MEMBAR_H

namespace mlir {
class Operation;
namespace intel {
/// Intel-specific callback to filter out pairs of operations that need no
/// barrier between them.
///
/// This is useful because the default granularity for deciding whether a
/// barrier is needed is quite coarse. The filter returns true if no barrier
/// is needed between `lhsOp` and `rhsOp`.
bool membarFilter(Operation *lhsOp, Operation *rhsOp);
} // namespace intel
} // namespace mlir

#endif // TRITON_INTEL_ANALYSIS_MEMBAR_H
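
The matching Membar.cpp (added to the CMake target at the end of this commit) is not shown in this excerpt. Purely to illustrate the contract described above, a hedged sketch of what such a filter could look like, assuming it reuses the cvtIsSubGroupTranspose helper from intel/include/Analysis/Utility.h that Allocation.cpp below also calls; names and structure here are illustrative, not the actual implementation:

// Illustrative sketch only; not the Membar.cpp from this commit.
#include "intel/include/Analysis/Utility.h"
#include "mlir/IR/BuiltinTypes.h"
#include "triton/Dialect/TritonGPU/IR/Dialect.h"

namespace mlir {
namespace intel {
bool membarFilter(Operation *lhsOp, Operation *rhsOp) {
  // Assumption: two back-to-back sub-group transposes each read back only the
  // scratch data their own sub-group wrote, so no barrier is required between
  // them (the behavior the test_back_to_back case above checks for).
  auto isSubGroupTranspose = [](Operation *op) {
    auto cvt = dyn_cast_or_null<triton::gpu::ConvertLayoutOp>(op);
    if (!cvt)
      return false;
    auto srcTy = dyn_cast<RankedTensorType>(cvt.getSrc().getType());
    auto dstTy = dyn_cast<RankedTensorType>(cvt.getResult().getType());
    return srcTy && dstTy &&
           triton::gpu::intel::cvtIsSubGroupTranspose(srcTy, dstTy);
  };
  return isSubGroupTranspose(lhsOp) && isSubGroupTranspose(rhsOp);
}
} // namespace intel
} // namespace mlir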

third_party/intel/lib/Analysis/Allocation.cpp

Lines changed: 28 additions & 6 deletions
@@ -15,6 +15,8 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "llvm/ADT/SmallVector.h"
 
+#include "intel/include/Analysis/Utility.h"
+
 using ::mlir::triton::gpu::AMDMfmaEncodingAttr;
 using ::mlir::triton::gpu::BlockedEncodingAttr;
 using ::mlir::triton::gpu::DotOperandEncodingAttr;
@@ -51,10 +53,10 @@ getCvtOrder(Attribute srcLayout, Attribute dstLayout) {
 
   // mma or dot layout does not have an order, so the order depends on the
   // layout of the other operand.
-  auto inOrd = (srcMmaLayout || srcDotLayout) ? getOrder(dstLayout)
-                                              : getOrder(srcLayout);
-  auto outOrd = (dstMmaLayout || dstDotLayout) ? getOrder(srcLayout)
-                                               : getOrder(dstLayout);
+  const auto &inOrd = (srcMmaLayout || srcDotLayout) ? getOrder(dstLayout)
+                                                     : getOrder(srcLayout);
+  const auto &outOrd = (dstMmaLayout || dstDotLayout) ? getOrder(srcLayout)
+                                                      : getOrder(dstLayout);
 
   return {inOrd, outOrd};
 }
@@ -104,6 +106,26 @@ static SmallVector<unsigned> getRepShapeForAtomic(Value result) {
 
 ScratchConfig getScratchConfigForCvt(RankedTensorType srcTy,
                                      RankedTensorType dstTy) {
+  if (gpu::intel::cvtIsSubGroupShuffle(srcTy, dstTy)) {
+    // Conversions that can be implemented as sub-group shuffles do not need
+    // scratch memory.
+    return ScratchConfig({}, {});
+  }
+
+  if (gpu::intel::cvtIsSubGroupTranspose(srcTy, dstTy)) {
+    // Conversions that can be implemented as sub-group transposes store the
+    // whole tensor in shared memory and read it afterwards.
+    auto srcEncoding = cast<gpu::DistributedEncodingTrait>(srcTy.getEncoding());
+    unsigned threadsPerWarp = product(srcEncoding.getThreadsPerWarp());
+    unsigned warpsPerCTA = product(srcEncoding.getWarpsPerCTA());
+    unsigned remaining = product(srcTy.getShape()) /
+                         (threadsPerWarp * threadsPerWarp * warpsPerCTA);
+    SmallVector<unsigned> repShape{threadsPerWarp, threadsPerWarp, remaining,
+                                   warpsPerCTA};
+    return ScratchConfig(repShape, repShape,
+                         /*inVec=*/1, /*outVec=*/threadsPerWarp);
+  }
+
   // Initialize vector sizes and stride
   auto repShape = getRepShapeForCvt(srcTy, dstTy);
   if (repShape.empty())
@@ -346,7 +368,7 @@ class AllocationAnalysis {
   /// arguments are involved.
   void resolveAliasBufferLiveness(
       function_ref<Interval<size_t>(Value value)> getLiveness) {
-    for (auto aliasBufferIter : allocation->getAliasBuffer()) {
+    for (const auto &aliasBufferIter : allocation->getAliasBuffer()) {
       auto value = aliasBufferIter.first;
       auto buffers = aliasBufferIter.second;
       auto range = getLiveness(value);
@@ -486,7 +508,7 @@ class AllocationAnalysis {
         std::find_if(xBuffers.begin(), xBuffers.end(), [&](auto *buffer) {
           auto xRange = bufferRange[buffer];
           bool res = xRange.intersects(range);
-          for (auto val : tripleMap)
+          for (const auto &val : tripleMap)
             res = res &&
                   !val.second.intersects(xRange); // only one buffer intersect
           return res;

third_party/intel/lib/Analysis/AxisInfo.cpp

Lines changed: 24 additions & 14 deletions
@@ -123,7 +123,8 @@ class BinaryOpVisitorImpl : public AxisInfoVisitorImpl<OpTy> {
       divisibility.push_back(getDivisibility(op, lhsInfo, rhsInfo, d));
     }
   }
-  return AxisInfo(contiguity, divisibility, constancy, constantValue);
+  return AxisInfo(std::move(contiguity), std::move(divisibility),
+                  std::move(constancy), constantValue);
 }
 
 protected:
@@ -543,7 +544,8 @@ class SplatOpAxisInfoVisitor final
       divisibility.push_back(opInfo.getDivisibility(0));
       constancy.push_back(retTy.getShape()[d]);
     }
-    return AxisInfo(contiguity, divisibility, constancy,
+    return AxisInfo(std::move(contiguity), std::move(divisibility),
+                    std::move(constancy),
                     operands[0]->getValue().getConstantValue());
   }
 };
@@ -574,7 +576,8 @@ class LoadOpAxisInfoVisitor final : public AxisInfoVisitorImpl<triton::LoadOp> {
           maskInfo.has_value() ? maskInfo->getConstancy(d) : 0));
     }
 
-    return AxisInfo(contiguity, divisibility, constancy);
+    return AxisInfo(std::move(contiguity), std::move(divisibility),
+                    std::move(constancy));
   }
 };
 
@@ -608,7 +611,8 @@ class ExpandDimsOpAxisInfoVisitor final
     contiguity.insert(contiguity.begin() + op.getAxis(), 1);
     divisibility.insert(divisibility.begin() + op.getAxis(), newDivisibility);
     constancy.insert(constancy.begin() + op.getAxis(), 1);
-    return AxisInfo(contiguity, divisibility, constancy,
+    return AxisInfo(std::move(contiguity), std::move(divisibility),
+                    std::move(constancy),
                     operands[0]->getValue().getConstantValue());
   }
 };
@@ -637,7 +641,8 @@ class BroadcastOpAxisInfoVisitor final
       constancy.push_back(opShape[d] == 1 ? retShape[d]
                                           : opInfo.getConstancy(d));
     }
-    return AxisInfo(contiguity, divisibility, constancy,
+    return AxisInfo(std::move(contiguity), std::move(divisibility),
+                    std::move(constancy),
                     operands[0]->getValue().getConstantValue());
   }
 };
@@ -712,7 +717,8 @@ class CmpOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
       contiguity.push_back(1);
     }
 
-    return AxisInfo(contiguity, divisibility, constancy, constantValue);
+    return AxisInfo(std::move(contiguity), std::move(divisibility),
+                    std::move(constancy), constantValue);
   }
 
 private:
@@ -840,7 +846,8 @@ class SelectOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
       constantValue = lhsInfo.getConstantValue();
     }
 
-    return AxisInfo(contiguity, divisibility, constancy, constantValue);
+    return AxisInfo(std::move(contiguity), std::move(divisibility),
+                    std::move(constancy), constantValue);
   }
 };
 
@@ -993,7 +1000,8 @@ class MaxMinOpAxisInfoVisitor final : public AxisInfoVisitorImpl<OpTy> {
         contiguity.push_back(
            std::min(lhsInfo.getContiguity(d), rhsInfo.getContiguity(d)));
       }
-      return AxisInfo(contiguity, divisibility, constancy, std::nullopt);
+      return AxisInfo(std::move(contiguity), std::move(divisibility),
+                      std::move(constancy), std::nullopt);
     }
   }
 };
@@ -1038,7 +1046,8 @@ class MakeTensorPtrOpAxisInfoVisitor final
       constancy.push_back(1);
     }
 
-    auto axisInfo = AxisInfo(contiguity, divisibility, constancy);
+    auto axisInfo = AxisInfo(std::move(contiguity), std::move(divisibility),
+                             std::move(constancy));
 
     LLVM_DEBUG({
       std::string axisStr;
@@ -1143,8 +1152,8 @@ LogicalResult AxisInfoAnalysis::visitOperation(
     auto vals = cast<DenseElementsAttr>(attr).getValues<int>();
     newConstancy = AxisInfo::DimVectorT(vals.begin(), vals.end());
   }
-  curr = AxisInfo(newContiguity, newDivisibility, newConstancy,
-                  curr.getConstantValue());
+  curr = AxisInfo(std::move(newContiguity), std::move(newDivisibility),
+                  std::move(newConstancy), curr.getConstantValue());
   // join all lattice elements
   for (auto *result : results)
     propagateIfChanged(result, result->join(curr));
@@ -1154,17 +1163,18 @@
 void AxisInfoAnalysis::visitForOpInductionVar(
     scf::ForOp op, ArrayRef<dataflow::Lattice<AxisInfo> *> argLattices) {
   ProgramPoint programPoint(op);
-  const auto lb =
+  const auto &lb =
       getLatticeElementFor(&programPoint, op.getLowerBound())->getValue();
-  const auto step =
+  const auto &step =
      getLatticeElementFor(&programPoint, op.getStep())->getValue();
 
   AxisInfo::DimVectorT knownContiguity(1, 1);
   AxisInfo::DimVectorT knownDivisibility(1, 1);
   AxisInfo::DimVectorT knownConstancy(1, 1);
   knownDivisibility[0] = gcd(lb.getDivisibility(0), step.getDivisibility(0));
   auto inductionVar =
-      AxisInfo(knownContiguity, knownDivisibility, knownConstancy);
+      AxisInfo(std::move(knownContiguity), std::move(knownDivisibility),
+               std::move(knownConstancy));
   (void)argLattices[0]->join(inductionVar);
 }
 
third_party/intel/lib/Analysis/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -3,6 +3,7 @@ add_triton_library(TritonIntelAnalysis
   AxisInfo.cpp
   DPAS.cpp
   Liveness.cpp
+  Membar.cpp
   Utility.cpp
 
   DEPENDS
