
Commit 06889c8

more tutorials
Signed-off-by: Anatoly Myachev <[email protected]>
1 parent c3ba504 commit 06889c8


5 files changed: +42 additions, -31 deletions

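All five tutorials get the same treatment: instead of hardcoding device='xpu', each file defines a module-level DEVICE constant from the backend name reported by Triton's active driver and uses it for every tensor allocation and default device argument. A minimal sketch of the pattern, assuming the reported backend name (e.g. "xpu" on Intel GPUs, "cuda" on NVIDIA) is also a valid torch device string on the installed PyTorch build:

    import torch
    import triton

    # Resolve the device from the active Triton driver instead of hardcoding 'xpu'.
    DEVICE = triton.runtime.driver.active.get_current_target().backend

    x = torch.rand(1024, device=DEVICE)  # was: torch.rand(1024, device='xpu')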

python/tutorials/05-layer-norm.py

Lines changed: 5 additions & 2 deletions
@@ -34,6 +34,9 @@
 import triton
 import triton.language as tl
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
+
 try:
     # This is https://github.com/NVIDIA/apex, NOT the apex on PyPi, so it
     # should not be added to extras_require in setup.py.
@@ -290,7 +293,7 @@ def backward(ctx, dy):
 layer_norm = LayerNorm.apply
 
 
-def test_layer_norm(M, N, dtype, eps=1e-5, device='xpu'):
+def test_layer_norm(M, N, dtype, eps=1e-5, device=DEVICE):
     # create data
     x_shape = (M, N)
     w_shape = (x_shape[-1], )
@@ -329,7 +332,7 @@ def test_layer_norm(M, N, dtype, eps=1e-5, device='xpu'):
         plot_name='layer-norm-backward',
         args={'M': 4096, 'dtype': torch.float16, 'mode': 'backward'},
     ))
-def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device='xpu'):
+def bench_layer_norm(M, N, dtype, provider, mode='backward', eps=1e-5, device=DEVICE):
     # create data
     x_shape = (M, N)
     w_shape = (x_shape[-1], )

python/tutorials/06-fused-attention.py

Lines changed: 7 additions & 5 deletions
@@ -19,6 +19,8 @@
 import triton
 import triton.language as tl
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
 
 def is_hip():
     return triton.runtime.driver.active.get_current_target().backend == "hip"
@@ -526,13 +528,13 @@ def backward(ctx, do):
 @pytest.mark.parametrize("causal", [True])
 def test_op(Z, H, N_CTX, HEAD_DIM, causal, dtype=torch.float16):
     torch.manual_seed(20)
-    q = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device="xpu").normal_(mean=0.0, std=0.5).requires_grad_())
-    k = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device="xpu").normal_(mean=0.0, std=0.5).requires_grad_())
-    v = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device="xpu").normal_(mean=0.0, std=0.5).requires_grad_())
+    q = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
+    k = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
+    v = (torch.empty((Z, H, N_CTX, HEAD_DIM), dtype=dtype, device=DEVICE).normal_(mean=0.0, std=0.5).requires_grad_())
     sm_scale = 0.5
     dout = torch.randn_like(q)
     # reference implementation
-    M = torch.tril(torch.ones((N_CTX, N_CTX), device="xpu"))
+    M = torch.tril(torch.ones((N_CTX, N_CTX), device=DEVICE))
     p = torch.matmul(q, k.transpose(2, 3)) * sm_scale
     if causal:
         p[:, :, M == 0] = float("-inf")
@@ -600,7 +602,7 @@ def test_op(Z, H, N_CTX, HEAD_DIM, causal, dtype=torch.float16):
 
 
 @triton.testing.perf_report(configs)
-def bench_flash_attention(BATCH, H, N_CTX, HEAD_DIM, causal, mode, provider, device="xpu"):
+def bench_flash_attention(BATCH, H, N_CTX, HEAD_DIM, causal, mode, provider, device=DEVICE):
     assert mode in ["fwd", "bwd"]
     dtype = torch.float16
     if "triton" in provider:

python/tutorials/07-extern-functions.py

Lines changed: 4 additions & 2 deletions
@@ -25,6 +25,8 @@
 
 from pathlib import Path
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
 
 @triton.jit
 def asin_kernel(
@@ -49,8 +51,8 @@ def asin_kernel(
 
 torch.manual_seed(0)
 size = 98432
-x = torch.rand(size, device='xpu')
-output_triton = torch.zeros(size, device='xpu')
+x = torch.rand(size, device=DEVICE)
+output_triton = torch.zeros(size, device=DEVICE)
 output_torch = torch.asin(x)
 n_elements = output_torch.numel()
 grid = lambda meta: (triton.cdiv(n_elements, meta['BLOCK_SIZE']), )

python/tutorials/08-grouped-gemm.py

Lines changed: 13 additions & 11 deletions
@@ -31,6 +31,8 @@
 import triton
 import triton.language as tl
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
 
 def is_cuda():
     return triton.runtime.driver.active.get_current_target().backend == "cuda"
@@ -145,7 +147,7 @@ def grouped_matmul_kernel(
 
 
 def group_gemm_fn(group_A, group_B):
-    device = torch.device('xpu')
+    device = torch.device(DEVICE)
     assert len(group_A) == len(group_B)
     group_size = len(group_A)
 
@@ -201,8 +203,8 @@ def group_gemm_fn(group_A, group_B):
     M = group_m[i]
     N = group_n[i]
     K = group_k[i]
-    A = torch.rand((M, K), device="xpu", dtype=torch.float16)
-    B = torch.rand((K, N), device="xpu", dtype=torch.float16)
+    A = torch.rand((M, K), device=DEVICE, dtype=torch.float16)
+    B = torch.rand((K, N), device=DEVICE, dtype=torch.float16)
     group_A.append(A)
     group_B.append(B)
 
@@ -264,9 +266,9 @@ def benchmark(N, provider):
     g_lds = []
     group_C = []
     for i in range(group_size):
-        A = torch.rand((N, N), device="xpu", dtype=torch.float16)
-        B = torch.rand((N, N), device="xpu", dtype=torch.float16)
-        C = torch.empty((N, N), device="xpu", dtype=torch.float16)
+        A = torch.rand((N, N), device=DEVICE, dtype=torch.float16)
+        B = torch.rand((N, N), device=DEVICE, dtype=torch.float16)
+        C = torch.empty((N, N), device=DEVICE, dtype=torch.float16)
         group_A.append(A)
         group_B.append(B)
         group_C.append(C)
@@ -276,11 +278,11 @@ def benchmark(N, provider):
        g_sizes += [N, N, N]
        g_lds += [N, N, N]
 
-    d_a_ptrs = torch.tensor(A_addrs, device="xpu")
-    d_b_ptrs = torch.tensor(B_addrs, device="xpu")
-    d_c_ptrs = torch.tensor(C_addrs, device="xpu")
-    d_g_sizes = torch.tensor(g_sizes, dtype=torch.int32, device="xpu")
-    d_g_lds = torch.tensor(g_lds, dtype=torch.int32, device="xpu")
+    d_a_ptrs = torch.tensor(A_addrs, device=DEVICE)
+    d_b_ptrs = torch.tensor(B_addrs, device=DEVICE)
+    d_c_ptrs = torch.tensor(C_addrs, device=DEVICE)
+    d_g_sizes = torch.tensor(g_sizes, dtype=torch.int32, device=DEVICE)
+    d_g_lds = torch.tensor(g_lds, dtype=torch.int32, device=DEVICE)
 
     quantiles = [0.5, 0.2, 0.8]
     if provider == ref_lib.lower():

python/tutorials/10-experimental-block-pointer.py

Lines changed: 13 additions & 11 deletions
@@ -95,6 +95,8 @@
 import triton
 import triton.language as tl
 
+DEVICE = triton.runtime.driver.active.get_current_target().backend
+
 
 @triton.autotune(
     configs=[
@@ -345,23 +347,23 @@ def matmul(a, b, accum_dtype, res_dtype):
         # [ 1 1 1 ... ],
         # [ 0 1 1 ... ], ... ]
         # in order only add 3 values per result matrix element.
-        a = torch.randn(shape, device='xpu', dtype=dtype)
-        b = torch.eye(shape[-2], device='xpu', dtype=dtype) + torch.diag(
-            torch.ones(shape[-2] - 1, device='xpu', dtype=dtype), diagonal=1) + torch.diag(
-            torch.ones(shape[-2] - 1, device='xpu', dtype=dtype), diagonal=-1)
+        a = torch.randn(shape, device=DEVICE, dtype=dtype)
+        b = torch.eye(shape[-2], device=DEVICE, dtype=dtype) + torch.diag(
+            torch.ones(shape[-2] - 1, device=DEVICE, dtype=dtype), diagonal=1) + torch.diag(
+            torch.ones(shape[-2] - 1, device=DEVICE, dtype=dtype), diagonal=-1)
         # duplicate b on batch dimension.
         if len(shape) == 3:
            b = b.unsqueeze(0).repeat(shape[0], 1, 1)
     else:
-        a = torch.randn(shape, device='xpu', dtype=dtype)
-        b = torch.randn(shape, device='xpu', dtype=dtype)
+        a = torch.randn(shape, device=DEVICE, dtype=dtype)
+        b = torch.randn(shape, device=DEVICE, dtype=dtype)
     torch_output = torch.matmul(a, b).to(dtype=res_dtype)
 else:
-    a = torch.randint(low=-127, high=128, size=shape, device='xpu', dtype=dtype)
-    b = torch.randint(low=-127, high=128, size=shape, device='xpu', dtype=dtype)
+    a = torch.randint(low=-127, high=128, size=shape, device=DEVICE, dtype=dtype)
+    b = torch.randint(low=-127, high=128, size=shape, device=DEVICE, dtype=dtype)
     # torch.matmul clamps values to input dtype; IPEX doesn't support int32 matmul
     torch_output = torch.matmul(a.to(device='cpu', dtype=accum_dtype),
-                                b.to(device='cpu', dtype=accum_dtype)).to(device='xpu', dtype=res_dtype)
+                                b.to(device='cpu', dtype=accum_dtype)).to(device=DEVICE, dtype=res_dtype)
 
 triton_output = matmul(a, b, accum_dtype, res_dtype)
 
@@ -408,8 +410,8 @@ def matmul(a, b, accum_dtype, res_dtype):
 
 @triton.testing.perf_report(configs)
 def benchmark(M, N, K, provider):
-    a = torch.randn((M, K), device='xpu', dtype=torch.float16)
-    b = torch.randn((K, N), device='xpu', dtype=torch.float16)
+    a = torch.randn((M, K), device=DEVICE, dtype=torch.float16)
+    b = torch.randn((K, N), device=DEVICE, dtype=torch.float16)
 
     quantiles = [0.5, 0.2, 0.8]
     if provider == ref_lib.lower():
