Remove MLIR_ENABLE_REMARK (#3990)

etiotto · anmyachev · whitneywhtsang · web-flow · commit ccc8ff8bba4e · 2025-04-23T20:15:22.000-04:00
Fixes issue #3987

---------

Signed-off-by: Tiotto, Ettore <ettore.tiotto@intel.com>
Co-authored-by: Anatoly Myachev <anatoliimyachev@mail.com>
Co-authored-by: Whitney Tsang <whitney.tsang@intel.com>
diff --git a/python/test/unit/test_perf_warning.py b/python/test/unit/test_perf_warning.py
@@ -5,7 +5,7 @@
 import torch
 import triton
 import triton.language as tl
-from triton._internal_testing import is_cuda
+from triton._internal_testing import is_cuda, is_xpu
 
 
 @contextmanager
@@ -18,6 +18,8 @@ def enable_diagnostics_context(value):
 
 
 def test_mma_remark(capfd, fresh_triton_cache):
+    if is_xpu():
+        pytest.xfail("Not designed for XPU")
     if is_cuda():
         capability = torch.cuda.get_device_capability()
         if capability[0] != 9:
@@ -104,6 +106,8 @@ def matmul_kernel(
 
 
 def test_remark_vectorization(capfd, fresh_triton_cache):
+    if is_xpu():
+        pytest.xfail("Not designed for XPU")
 
     @triton.jit
     def ldst_vec(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, XBLOCK: tl.constexpr):
@@ -164,7 +168,7 @@ def ldst_vec(in_ptr0, in_ptr1, in_ptr2, in_ptr3, out_ptr0, XBLOCK: tl.constexpr)
     assert "note: diagnostic emitted with trace:" in err
 
 
-def test_remark_swp_op_before_operands(capfd, fresh_triton_cache):
+def test_remark_swp_op_before_operands(capfd, fresh_triton_cache, device):
 
     @triton.jit
     def kernel_pipe_error(in_ptr, out_ptr):
@@ -180,6 +184,6 @@ def kernel_pipe_error(in_ptr, out_ptr):
             if tl.max(val) > 0:
                 k += 1
 
-    i = torch.empty(64 * 64, dtype=torch.float32).cuda()
-    o = torch.empty(64 * 64, dtype=torch.float32).cuda()
+    i = torch.empty(64 * 64, dtype=torch.float32, device=device)
+    o = torch.empty(64 * 64, dtype=torch.float32, device=device)
     kernel_pipe_error[(1, )](i, o)
diff --git a/scripts/test-triton.sh b/scripts/test-triton.sh
@@ -214,7 +214,10 @@ run_core_tests() {
     run_pytest_command -k "not test_within_2gb" --verbose --device xpu runtime/ --ignore=runtime/test_cublas.py
 
   TRITON_TEST_SUITE=debug \
-    run_pytest_command --verbose -n ${PYTEST_MAX_PROCESSES:-8} test_debug.py --forked --device xpu
+    run_pytest_command --verbose -n ${PYTEST_MAX_PROCESSES:-8} test_debug.py test_debug_dump.py --forked --device xpu
+
+  TRITON_TEST_SUITE=warnings \
+    run_pytest_command --verbose -n ${PYTEST_MAX_PROCESSES:-8} test_perf_warning.py --device xpu
 
   # run test_line_info.py separately with TRITON_DISABLE_LINE_INFO=0
   TRITON_DISABLE_LINE_INFO=0 TRITON_TEST_SUITE=line_info \
diff --git a/third_party/intel/backend/compiler.py b/third_party/intel/backend/compiler.py
@@ -253,11 +253,6 @@ def make_ttgir(mod, metadata, opt, properties):
             cluster_info.clusterDimX = opt.cluster_dims[0]
             cluster_info.clusterDimY = opt.cluster_dims[1]
             cluster_info.clusterDimZ = opt.cluster_dims[2]
-        # Set up Diagnostic
-        if os.environ.get("MLIR_ENABLE_REMARK", "0") == "1":
-            srcMgr = llvm.source_mgr()
-            ir.source_mgr_diag(srcMgr, mod.context)
-            mod.context.printOpOnDiagnostic(True)
 
         # Annotate module with information required by subsequent transformations.
         pm = ir.pass_manager(mod.context)
@@ -330,15 +325,12 @@ def make_llir(src, metadata, options):
             metadata["num_warps"] *= num_warp_groups
         threads_per_warp = intel.get_threads_per_warp(src)
         metadata["threads_per_warp"] = threads_per_warp
+
         mod = src
         # TritonGPU -> LLVM-IR (MLIR)
         pm = ir.pass_manager(mod.context)
         pm.enable_debug()
-        # Set up Diagnostic
-        if os.environ.get("MLIR_ENABLE_REMARK", "0") == "1":
-            srcMgr = llvm.source_mgr()
-            ir.source_mgr_diag(srcMgr, mod.context)
-            mod.context.printOpOnDiagnostic(True)
+
         passes.convert.add_scf_to_cf(pm)
         passes.convert.add_index_to_llvmir(pm)
         # FIXME: Advanced path uses custom type conversion and needs hacky