Commit c3ba504

Choose device in tutorials through 'driver.active.get_current_target().backend'
Signed-off-by: Anatoly Myachev <[email protected]>
1 parent 3f6ffa5 commit c3ba504

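The pattern is the same across all four tutorials: ask the active Triton driver for the current target and use its backend name (e.g. "cuda", "hip" or "xpu") wherever a device string was previously hard-coded. A minimal sketch of the idea, assuming a working Triton install with an active GPU backend whose name is also accepted by torch as a device string:

import torch
import triton

# Backend name reported by the active Triton driver, e.g. "cuda" or "xpu".
DEVICE = triton.runtime.driver.active.get_current_target().backend

# torch accepts the same string as a device argument for these backends.
x = torch.rand(4, device=DEVICE)
print(DEVICE, x.device)
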
4 files changed: +28 additions, -19 deletions
python/tutorials/01-vector-add.py
Lines changed: 8 additions & 5 deletions

@@ -23,6 +23,8 @@
 import triton
 import triton.language as tl
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
 
 @triton.jit
 def add_kernel(x_ptr,  # *Pointer* to first input vector.
@@ -60,7 +62,8 @@ def add_kernel(x_ptr,  # *Pointer* to first input vector.
 def add(x: torch.Tensor, y: torch.Tensor):
     # We need to preallocate the output.
     output = torch.empty_like(x)
-    assert x.is_xpu and y.is_xpu and output.is_xpu
+    is_dvc = f"is_{DEVICE}"
+    assert getattr(x, is_dvc) and getattr(y, is_dvc) and getattr(output, is_dvc)
     n_elements = output.numel()
     # The SPMD launch grid denotes the number of kernel instances that run in parallel.
     # It is analogous to CUDA launch grids. It can be either Tuple[int], or Callable(metaparameters) -> Tuple[int].
@@ -81,8 +84,8 @@ def add(x: torch.Tensor, y: torch.Tensor):
 
 torch.manual_seed(0)
 size = 98432
-x = torch.rand(size, device='xpu')
-y = torch.rand(size, device='xpu')
+x = torch.rand(size, device=DEVICE)
+y = torch.rand(size, device=DEVICE)
 output_torch = x + y
 output_triton = add(x, y)
 print(output_torch.cpu())
@@ -116,8 +119,8 @@ def add(x: torch.Tensor, y: torch.Tensor):
         args={},  # Values for function arguments not in `x_names` and `y_name`.
     ))
 def benchmark(size, provider):
-    x = torch.rand(size, device='xpu', dtype=torch.float32)
-    y = torch.rand(size, device='xpu', dtype=torch.float32)
+    x = torch.rand(size, device=DEVICE, dtype=torch.float32)
+    y = torch.rand(size, device=DEVICE, dtype=torch.float32)
     quantiles = [0.5, 0.2, 0.8]
     if provider == 'torch':
         ms, min_ms, max_ms = triton.testing.do_bench(lambda: x + y, quantiles=quantiles)

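The rewritten assertion relies on torch tensors exposing a per-backend boolean attribute (x.is_cuda, x.is_xpu, ...). A small sketch of that check in isolation, assuming DEVICE resolves to a backend for which such an attribute exists:

import torch
import triton

DEVICE = triton.runtime.driver.active.get_current_target().backend

x = torch.rand(8, device=DEVICE)
is_dvc = f"is_{DEVICE}"        # e.g. "is_cuda" or "is_xpu"
assert getattr(x, is_dvc)      # same effect as x.is_cuda / x.is_xpu
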
python/tutorials/02-fused-softmax.py
Lines changed: 7 additions & 5 deletions

@@ -27,6 +27,8 @@
 import triton.language as tl
 from triton.runtime import driver
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
 
 def is_hip():
     return triton.runtime.driver.active.get_current_target().backend == "hip"
@@ -110,7 +112,7 @@ def softmax_kernel(output_ptr, input_ptr, input_row_stride, output_row_stride, n
 # %%
 # We can create a helper function that enqueues the kernel and its (meta-)arguments for any given input tensor.
 
-device = torch.xpu.current_device()
+device = getattr(torch, DEVICE).current_device()
 properties = driver.active.utils.get_device_properties(device)
 NUM_SM = properties["multiprocessor_count"]
 SIZE_SMEM = properties["max_shared_mem"]
@@ -194,7 +196,7 @@ def allocated_slm_size(size_smem):
 # This will allow us to verify that our padding mechanism works.
 
 torch.manual_seed(0)
-x = torch.randn(1823, 781, device='xpu')
+x = torch.randn(1823, 781, device=DEVICE)
 y_triton = softmax(x)
 y_torch = torch.softmax(x, axis=1)
 assert torch.allclose(y_triton, y_torch), (y_triton, y_torch)
@@ -226,9 +228,9 @@ def allocated_slm_size(size_smem):
         args={'M': 4096},  # values for function arguments not in `x_names` and `y_name`
     ))
 def benchmark(M, N, provider):
-    x = torch.randn(M, N, device='xpu', dtype=torch.float32)
-    stream = torch.xpu.Stream()
-    torch.xpu.set_stream(stream)
+    x = torch.randn(M, N, device=DEVICE, dtype=torch.float32)
+    stream = getattr(torch, DEVICE).Stream()
+    getattr(torch, DEVICE).set_stream(stream)
     if provider == 'torch':
         ms = triton.testing.do_bench(lambda: torch.softmax(x, axis=-1))
     if provider == 'triton':

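In this tutorial the backend name is also used to look up the matching torch submodule (torch.cuda, torch.xpu), so device queries and stream handling stay backend-agnostic. A sketch of that lookup, assuming the resolved submodule exposes the same current_device/Stream/set_stream API that the diff relies on:

import torch
import triton

DEVICE = triton.runtime.driver.active.get_current_target().backend

backend_mod = getattr(torch, DEVICE)       # torch.cuda, torch.xpu, ...
device = backend_mod.current_device()      # index of the current device
stream = backend_mod.Stream()              # backend-specific stream object
backend_mod.set_stream(stream)
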
python/tutorials/03-matrix-multiplication.py
Lines changed: 8 additions & 6 deletions

@@ -154,6 +154,8 @@
 import triton
 import triton.language as tl
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
 
 def is_cuda():
     return triton.runtime.driver.active.get_current_target().backend == "cuda"
@@ -390,8 +392,8 @@ def matmul(a, b, activation=""):
 # We can test our custom matrix multiplication operation against a native torch implementation (i.e., cuBLAS).
 
 torch.manual_seed(0)
-a = torch.randn((512, 512), device='xpu', dtype=torch.float16)
-b = torch.randn((512, 512), device='xpu', dtype=torch.float16)
+a = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
+b = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
 triton_output = matmul(a, b)
 torch_output = torch.matmul(a, b)
 print(f"triton_output_with_fp16_inputs={triton_output}")
@@ -408,8 +410,8 @@ def matmul(a, b, activation=""):
 TORCH_HAS_FP8 = hasattr(torch, "float8_e5m2")
 if TORCH_HAS_FP8 and is_cuda():
     torch.manual_seed(0)
-    a = torch.randn((512, 512), device="xpu", dtype=torch.float16)
-    b = torch.randn((512, 512), device="xpu", dtype=torch.float16)
+    a = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
+    b = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
     a = a.to(torch.float8_e5m2)
     # pre-transpose b for efficiency.
     b = b.T
@@ -458,8 +460,8 @@ def matmul(a, b, activation=""):
 
 @triton.testing.perf_report(configs)
 def benchmark(M, N, K, provider, fp8_inputs):
-    a = torch.randn((M, K), device='xpu', dtype=torch.float16)
-    b = torch.randn((K, N), device='xpu', dtype=torch.float16)
+    a = torch.randn((M, K), device=DEVICE, dtype=torch.float16)
+    b = torch.randn((K, N), device=DEVICE, dtype=torch.float16)
     if TORCH_HAS_FP8 and fp8_inputs:
         a = a.to(torch.float8_e5m2)
         b = b.T

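Only the tensor allocations change in this file; the fp8 path still gates on is_cuda(), so it runs only when the active backend is CUDA. A sketch of that backend-dependent branching, reusing the DEVICE constant and the is_cuda() helper from the tutorial:

import torch
import triton

DEVICE = triton.runtime.driver.active.get_current_target().backend


def is_cuda():
    return triton.runtime.driver.active.get_current_target().backend == "cuda"


a = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
if hasattr(torch, "float8_e5m2") and is_cuda():
    a = a.to(torch.float8_e5m2)   # fp8 conversion only on the CUDA path, as in the tutorial
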
python/tutorials/04-low-memory-dropout.py
Lines changed: 5 additions & 3 deletions

@@ -38,6 +38,8 @@
 import triton
 import triton.language as tl
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
 
 @triton.jit
 def _dropout(
@@ -71,10 +73,10 @@ def dropout(x, x_keep, p):
 
 
 # Input tensor
-x = torch.randn(size=(10, )).xpu()
+x = getattr(torch.randn(size=(10, )), DEVICE)()
 # Dropout mask
 p = 0.5
-x_keep = (torch.rand(size=(10, )) > p).to(torch.int32).xpu()
+x_keep = getattr((torch.rand(size=(10, )) > p).to(torch.int32), DEVICE)()
 #
 output = dropout(x, x_keep=x_keep, p=p)
 print(tabulate.tabulate([
@@ -138,7 +140,7 @@ def seeded_dropout(x, p, seed):
     return output
 
 
-x = torch.randn(size=(10, )).xpu()
+x = getattr(torch.randn(size=(10, )), DEVICE)()
 # Compare this to the baseline - dropout mask is never instantiated!
 output = seeded_dropout(x, p=0.5, seed=123)
 output2 = seeded_dropout(x, p=0.5, seed=123)

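Here the tensors are created on the CPU first, so the device move is done by calling the method named after the backend (tensor.cuda(), tensor.xpu()). A sketch of that pattern, assuming such a method exists for the active backend; tensor.to(DEVICE) would be an equivalent spelling:

import torch
import triton

DEVICE = triton.runtime.driver.active.get_current_target().backend

x_cpu = torch.randn(size=(10, ))
x = getattr(x_cpu, DEVICE)()   # resolves to x_cpu.cuda() or x_cpu.xpu()
# equivalent: x = x_cpu.to(DEVICE)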