Remove product names from comments and docs (#755)

xiaohuguo2023 · web-flow · commit 48455d1862be · 2025-03-13T20:07:34.000Z
diff --git a/python/perf-kernels/06-fused-attention-transV.py b/python/perf-kernels/06-fused-attention-transV.py
@@ -825,7 +825,7 @@ def test_op_bwd(Z, H, N_CTX, D_HEAD, dtype=torch.float16):
     assert torch.allclose(ref_out, tri_out, atol=1e-2, rtol=0)
     if torch.version.hip is None:
         assert torch.allclose(ref_dv, tri_dv, atol=1e-2, rtol=0)
-    # The current block size for MI200 series is 64x64. This results in
+    # The current block size for gfx90a and gfx908 is 64x64. This results in
     # larger differences in float results due to rounding.
     else:
         assert torch.allclose(ref_dv, tri_dv, atol=5e-2, rtol=0)
diff --git a/python/perf-kernels/flash-attention.py b/python/perf-kernels/flash-attention.py
@@ -1833,7 +1833,7 @@ def test_op_bwd(Z, H, N_CTX, D_HEAD, qseqlen_not_equal_kseqlen, causal, torch_sd
     #print(tri_dv)
     # compare
     torch.testing.assert_close(ref_out, tri_out, atol=1e-2, rtol=0)
-    # The current block size for MI200 series is 64x64. This results in
+    # The current block size for gfx90a and gfx908 is 64x64. This results in
     # larger differences in float results due to rounding.
 
     if dtype == torch.bfloat16:
diff --git a/python/perf-kernels/streamk/03-matrix-multiplication-stream-k.py b/python/perf-kernels/streamk/03-matrix-multiplication-stream-k.py
@@ -73,7 +73,7 @@ def _call(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor, bias: torch.Tensor,
         # compute grid (work to do per SM on the first wave)
         grids = total_programs_streamk
         stride_bias = bias.stride(0) if use_bias else 0
-        # MI300X settings, MI250 set num_xcds = 1
+        # gfx942 settings; on gfx90a, set num_xcds = 1
         num_xcds = 8
         kk = streamk_gemm[(grids, )](
             a,
diff --git a/python/perf-kernels/streamk/README.md b/python/perf-kernels/streamk/README.md
@@ -69,7 +69,7 @@ The plan is to use this version as the base version for the future triton stream
 
 - use atomics for spinning lock to replace atomic_add for the final output.
 
-- pid renumbering based on chiplet structure of MI300X
+- pid renumbering based on chiplet structure of gfx942
 
 - dynamic grid setting
 
diff --git a/python/perf-kernels/streamk/tune_streamk.py b/python/perf-kernels/streamk/tune_streamk.py
@@ -398,7 +398,7 @@ def matmul(kernel_func, a, b, c, bias, P, locks, num_sms, block_m, block_n, bloc
     m_tiles = triton.cdiv(M, block_m)
     n_tiles = triton.cdiv(N, block_n)
     streamk_tiles = m_tiles * n_tiles % num_sms
-    # change num_xcds = 1 if using MI250
+    # set num_xcds = 1 if using gfx90a
     num_xcds = 8
     kernel_func[
         grid,
diff --git a/python/perf-kernels/streamk/utils/gemm_wrapper.py b/python/perf-kernels/streamk/utils/gemm_wrapper.py
@@ -68,7 +68,7 @@ def _call(a: torch.Tensor, b: torch.Tensor, c: torch.Tensor, bias: torch.Tensor,
         grids = min(total_programs_streamk, total_tiles)
         total_programs_streamk = min(total_programs_streamk, total_tiles)
         stride_bias = bias.stride(0) if use_bias else 0
-        # MI300X settings, MI250 set num_xcds = 1
+        # gfx942 settings; on gfx90a, set num_xcds = 1
         num_xcds = 8
 
         kk = streamk_gemm[(grids, )](
diff --git a/python/perf-kernels/tools/plot-layout/plot_layout.py b/python/perf-kernels/tools/plot-layout/plot_layout.py
@@ -302,7 +302,7 @@ def checkMfmaValidity(mfmaNonKDim, kWidth, kGroup, dtype_a, dtype_b, trans, scal
         dtype_a == dtype_b), f"Cannot do mixed precision mfma with {dtype_a} and {dtype_b}"
     '''
     Check mfma size according to data types
-    * refers to newly added instructions on MI350
+    * refers to newly added instructions on gfx950
     Both dtyes are f4 or fp6 or bf6
       *mfma_f32_16x16x128_f8f6f4: kWidth = 32, kGroup = 1
       *mfma_f32_32x32x64_f8f6f4: kWidth = 32, kGroup = 1