
Commit 74645a2

Enable 09-persistent-matmul.py on Win; don't import proton (#4489)
To fix (from https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/15570525521/job/43845094182):

```bash
  File "C:\gh15570525521\python\tutorials\09-persistent-matmul.py", line 28, in <module>
    import triton.profiler as proton
  File "C:\ar\_work\intel-xpu-backend-for-triton\intel-xpu-backend-for-triton\.venv\lib\site-packages\triton\profiler\__init__.py", line 2, in <module>
    from .scope import scope, cpu_timed_scope, enter_scope, exit_scope
  File "C:\ar\_work\intel-xpu-backend-for-triton\intel-xpu-backend-for-triton\.venv\lib\site-packages\triton\profiler\scope.py", line 7, in <module>
    from triton._C.libproton import proton as libproton
ModuleNotFoundError: No module named 'triton._C.libproton'
```

* BMG CI: https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/15593957844 (failed)
* https://github.com/intel/intel-xpu-backend-for-triton/actions/runs/15607019612 (passed)

---------

Signed-off-by: Anatoly Myachev <[email protected]>
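For context, here is a minimal, self-contained sketch of the pattern this commit applies: Proton (`triton.profiler`) depends on the native `triton._C.libproton` extension, which is not built for Windows, so the import is guarded on `os.name`. The `proton = None` fallback and the `profiled_matmul` helper below are illustrative assumptions, not part of this change; the tutorial itself instead branches at the call site, as the second hunk of the diff shows.

```python
import os

import torch

# Guarded import: triton._C.libproton is not shipped on Windows, so importing
# triton.profiler there raises the ModuleNotFoundError shown above.
if os.name != "nt":
    import triton.profiler as proton
else:
    proton = None  # illustrative fallback; the commit itself simply skips the import


def profiled_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    # Hypothetical helper: wrap the matmul in a proton scope only when the
    # profiler was actually imported, so the same code also runs on Windows.
    M, K = a.shape
    N, K = b.shape
    if proton is not None:
        with proton.scope(f"torch [M={M}, N={N}, K={K}]",
                          {"bytes": a.element_size() * (M * K + N * K + M * N)}):
            return torch.matmul(a, b.T)
    return torch.matmul(a, b.T)
```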
1 parent: 7bc086a

File tree

1 file changed: +9 −3 lines changed


python/tutorials/09-persistent-matmul.py

Lines changed: 9 additions & 3 deletions
```diff
@@ -19,18 +19,21 @@
 Note that currently this tutorial will fail on devices with a small shared memory size, such as RTX-4090.
 """
 
+import os
 import argparse
 import itertools
 
 import torch
 import triton
 import triton.language as tl
-import triton.profiler as proton
 from triton.tools.tensor_descriptor import TensorDescriptor
 from contextlib import contextmanager
 
 from typing import Optional
 
+if os.name != "nt":
+    import triton.profiler as proton
+
 DEVICE = triton.runtime.driver.active.get_active_torch_device()
 DEVICE_TOTAL_MEMORY = torch.xpu.get_device_properties().total_memory
 
@@ -625,8 +628,11 @@ def torch_matmul(a, b):
     N, K = b.shape
     bytes_per_elem = a.element_size()
     flops_str = f"flops{bytes_per_elem * 8}"
-    with proton.scope(f"torch [M={M}, N={N}, K={K}]",
-                      {"bytes": bytes_per_elem * (M * K + N * K + M * N), flops_str: 2. * M * N * K}):
+    if is_cuda():
+        with proton.scope(f"torch [M={M}, N={N}, K={K}]",
+                          {"bytes": bytes_per_elem * (M * K + N * K + M * N), flops_str: 2. * M * N * K}):
+            c = torch.matmul(a, b.T)
+    else:
         c = torch.matmul(a, b.T)
     return c
 
```
Comments (0)