@@ -6490,8 +6490,6 @@ def triton_gather(src: torch.Tensor, axis: int, indices: torch.Tensor):
 ])
 def test_gather_warp_shuffle(src_shape, indices_shape, axis, src_layout, indices_layout, tmp_path: pathlib.Path,
                              device):
-    if is_xpu():
-        pytest.skip("warp-local gather has issues on XPU")
     if is_hip():
         pytest.skip("warp-local gather has issues on HIP")
 
@@ -6517,13 +6515,13 @@ def inject_layout(ir, src: torch.Tensor, axis, indices: torch.Tensor, src_layout
 
     pat = r"(%[0-9]+) = tt.gather (%[0-9]+)\[(%[0-9]+)\] {axis = "
     pat += str(axis)
-    pat += r" : i32} : \(tensor\<"
+    pat += r" : i32[, efficient_layout]*} : \(tensor\<"
     pat += src_spec
-    pat += r", (#[a-z]+[0-9]+)\>, tensor\<"
+    pat += r", (#[a-z]+[0-9]*)\>, tensor\<"
     pat += indices_spec
-    pat += r", (#[a-z]+[0-9]+)\>\) -> tensor\<"
+    pat += r", (#[a-z]+[0-9]*)\>\) -> tensor\<"
     pat += output_spec
-    pat += r", (#[a-z]+[0-9]+)\>"
+    pat += r", (#[a-z]+[0-9]*)\>"
 
     repl = r"""
     %src = ttg.convert_layout \2 : tensor<""" + src_spec + r""", \4> -> tensor<""" + src_spec + r""", #src_layout>
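 
For reference, a minimal standalone sketch (not part of this commit) of why the loosened pattern still matches both spellings of the op: "[, efficient_layout]*" is a character class repeated zero or more times, so it accepts the old attribute dict "{axis = 0 : i32}" as well as the new "{axis = 0 : i32, efficient_layout}", and "[0-9]*" tolerates layout names without a numeric suffix. The 32x4 tensor specs are hypothetical stand-ins for src_spec/indices_spec/output_spec:

import re

# Pattern assembled the same way as in the test, for axis = 0.
pat = r"(%[0-9]+) = tt.gather (%[0-9]+)\[(%[0-9]+)\] {axis = 0"
pat += r" : i32[, efficient_layout]*} : \(tensor\<32x4xf32"
pat += r", (#[a-z]+[0-9]*)\>, tensor\<32x4xi32"
pat += r", (#[a-z]+[0-9]*)\>\) -> tensor\<32x4xf32"
pat += r", (#[a-z]+[0-9]*)\>"

without_attr = ("%5 = tt.gather %3[%4] {axis = 0 : i32} : "
                "(tensor<32x4xf32, #blocked>, tensor<32x4xi32, #blocked>) "
                "-> tensor<32x4xf32, #blocked>")
with_attr = without_attr.replace("i32}", "i32, efficient_layout}")

assert re.search(pat, without_attr)  # old printed form still matches
assert re.search(pat, with_attr)     # new efficient_layout form matches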
@@ -6546,7 +6544,9 @@ def inject_layout(ir, src: torch.Tensor, axis, indices: torch.Tensor, src_layout
     temp_file.write_text(ir)
 
     kernel = triton.compile(str(temp_file))
-    assert ("nvvm.shfl.sync.idx" in kernel.asm["llir"]) or ("llvm.amdgcn.ds.bpermute" in kernel.asm["llir"])
+    assert ("nvvm.shfl.sync.idx" in kernel.asm["llir"]) or ("llvm.amdgcn.ds.bpermute"
+                                                            in kernel.asm["llir"]) or ("_Z17sub_group_shufflefj"
+                                                            in kernel.asm["llir"])
 
     kernel[(1, 1, 1)](src, indices, output)
 
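 
The added third disjunct covers the XPU backend: _Z17sub_group_shufflefj is the Itanium-mangled form of sub_group_shuffle(float, unsigned int), the SYCL sub-group shuffle builtin that a warp-local gather lowers to there. A hedged sketch of the same check factored out per backend (the names below are illustrative, not from the commit):

# Markers indicating the gather was lowered to a warp/sub-group shuffle
# rather than going through shared/local memory.
SHUFFLE_MARKERS = (
    "nvvm.shfl.sync.idx",        # NVIDIA warp-shuffle intrinsic
    "llvm.amdgcn.ds.bpermute",   # AMD lane-permute intrinsic
    "_Z17sub_group_shufflefj",   # mangled SYCL sub_group_shuffle(float, uint)
)

def uses_warp_shuffle(llir: str) -> bool:
    # Equivalent to the assert in the test, one marker per backend.
    return any(marker in llir for marker in SHUFFLE_MARKERS)

# Usage in the test would be: assert uses_warp_shuffle(kernel.asm["llir"])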