Skip to content

Commit 2f0e254

Browse files
authored
Merge branch 'main' into xu_fix_build_fail_message
2 parents c4cf398 + b3ce5fb commit 2f0e254

File tree

6 files changed

+33
-24
lines changed

6 files changed

+33
-24
lines changed

.github/workflows/inductor-tests.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,7 @@ env:
5252
inductor/test_select_algorithm.py
5353
inductor/test_max_autotune.py
5454
inductor/test_compile_subprocess.py
55+
inductor/test_analysis.py
5556
5657
jobs:
5758
compute-params:

.github/workflows/try-latest-pytorch.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -96,6 +96,7 @@ jobs:
9696
inductor/test_select_algorithm.py
9797
inductor/test_max_autotune.py
9898
inductor/test_compile_subprocess.py
99+
inductor/test_analysis.py
99100
runner_label: ${{ inputs.runner_label }}
100101
python_version: "3.10"
101102

python/test/unit/intel/test_mxfp_matmul.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -36,9 +36,9 @@ def mxfp_matmul( #
3636
a_scale, b_scale, #
3737
M, N, K, #
3838
stride_scale, #
39-
stride_am, stride_ak, #
40-
stride_bk, stride_bn, #
41-
stride_cm, stride_cn, #
39+
stride_am: tl.constexpr, stride_ak: tl.constexpr, #
40+
stride_bk: tl.constexpr, stride_bn: tl.constexpr, #
41+
stride_cm: tl.constexpr, stride_cn: tl.constexpr, #
4242
DTYPE_A: tl.constexpr, #
4343
DTYPE_B: tl.constexpr, #
4444
BLOCK_M: tl.constexpr, #

python/test/unit/language/test_matmul.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1252,9 +1252,9 @@ def create_operand(dtype: str, size0: int, size1: int, k_dim: int, transpose: bo
12521252
kernel_kwargs = {}
12531253
if is_hip():
12541254
kernel_kwargs["matrix_instr_nonkdim"] = nonKDim
1255-
if is_xpu() and (128, 256, 256) == (BLOCK_M, BLOCK_N, BLOCK_K) and not CONST_SCALE and not PACK_B_ALONG_K:
1256-
kernel_kwargs["num_warps"] = 8
12571255
if is_xpu():
1256+
# Since the block sizes are big, we use num_warps = 32 to avoid pressure problems.
1257+
kernel_kwargs["num_warps"] = 32
12581258
kernel_kwargs["grf_mode"] = "256"
12591259
out = mxfp8_mxfp4_matmul[grid](a, b, output, a_scale, b_scale, M, N, K, stride_scale, a.stride(0), a.stride(1),
12601260
b.stride(0), b.stride(1), output.stride(0), output.stride(1), not CONST_SCALE,
Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,2 +0,0 @@
1-
# https://github.com/intel/intel-xpu-backend-for-triton/issues/5074
2-
tests/test_matmul.py::test_op

third_party/intel/lib/TritonIntelGPUTransforms/RemoveLayoutConversions.cpp

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -172,8 +172,9 @@ class LayoutRematerialization {
172172
void reduceLoopCarriedValues();
173173
// Existing tuples of (value, layout) that needs to be updated when recreating
174174
// scf ops. This prevents keeping track of Values that have been delete when
175-
// rewriting slices.
176-
DenseMap<Value, Attribute> mappedValues;
175+
// rewriting slices. The Value may be mapped to different attributes in remove
176+
// layout.
177+
DenseMap<Value, SmallVector<Attribute>> mappedValues;
177178
// map of the values remat based on encoding.
178179
DenseMap<std::pair<Value, Attribute>, Value> rematMapping;
179180
// DenseMap<std::pair<Operation*, Attribute>, Operation*>
@@ -187,7 +188,10 @@ void LayoutRematerialization::addRematValue(Value old, Attribute encoding,
187188
Value newV) {
188189
LDBG("addRematValue " << old << " encoding " << encoding << " " << newV);
189190
rematMapping[{old, encoding}] = newV;
190-
mappedValues[old] = encoding;
191+
if (mappedValues.contains(old))
192+
mappedValues[old].push_back(encoding);
193+
else
194+
mappedValues[old] = {encoding};
191195
}
192196

193197
// Remove unneeded values now that we are done with the rematMapping.
@@ -992,22 +996,27 @@ void LayoutRematerialization::updateRematMapping(
992996
for (auto [old, newV] : values) {
993997
auto it = mappedValues.find(old);
994998
if (it != mappedValues.end()) {
995-
Attribute encoding = it->second;
996-
auto rematIt = rematMapping.find({old, it->second});
997-
assert(rematIt != rematMapping.end());
998-
Value replacedValue = rematIt->second;
999-
rematMapping.erase(rematIt);
1000-
mappedValues.erase(it);
1001-
// Loop through the replacement value to find the new version of remat
1002-
// value. This should be okay as the number of values should be small.
1003-
for (auto [before, after] : values) {
1004-
if (before == replacedValue) {
1005-
replacedValue = after;
1006-
break;
999+
SmallVector<Attribute> encodings = it->second;
1000+
for (Attribute encoding : encodings) {
1001+
auto rematIt = rematMapping.find({old, encoding});
1002+
assert(rematIt != rematMapping.end());
1003+
Value replacedValue = rematIt->second;
1004+
rematMapping.erase(rematIt);
1005+
// Loop through the replacement value to find the new version of remat
1006+
// value. This should be okay as the number of values should be small.
1007+
for (auto [before, after] : values) {
1008+
if (before == replacedValue) {
1009+
replacedValue = after;
1010+
break;
1011+
}
10071012
}
1013+
rematMapping[{newV, encoding}] = replacedValue;
10081014
}
1009-
rematMapping[{newV, encoding}] = replacedValue;
1010-
mappedValues[newV] = encoding;
1015+
mappedValues.erase(it);
1016+
if (mappedValues.contains(newV))
1017+
mappedValues[newV].append(encodings);
1018+
else
1019+
mappedValues[newV] = std::move(encodings);
10111020
}
10121021
}
10131022
}

0 commit comments

Comments
 (0)