Skip to content

Commit 933cefc

Browse files
authored
[TUTORIAL] Adjust rand number range for matmul tutorial (#7505)
This PR fixes the "Unit Test" in `03-matrix-multiplication.py` on MI300x and MI350x GPUs. `torch.randn((512, 512), device=DEVICE, dtype=torch.float16)` can generate relatively large absolute numbers in the input, which may lead to larger sums having a larger absolute roundoff error as the exponent grows. The unit test also passes on AMD MI250 and Nvidia H100.
1 parent 55db9b8 commit 933cefc

File tree

1 file changed

+4
-12
lines changed

1 file changed

+4
-12
lines changed

python/tutorials/03-matrix-multiplication.py

Lines changed: 4 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -161,11 +161,6 @@ def is_cuda():
161161
return triton.runtime.driver.active.get_current_target().backend == "cuda"
162162

163163

164-
def is_hip_cdna2():
165-
target = triton.runtime.driver.active.get_current_target()
166-
return target.backend == 'hip' and target.arch == 'gfx90a'
167-
168-
169164
def get_cuda_autotune_config():
170165
return [
171166
triton.Config({'BLOCK_SIZE_M': 128, 'BLOCK_SIZE_N': 256, 'BLOCK_SIZE_K': 64, 'GROUP_SIZE_M': 8}, num_stages=3,
@@ -364,17 +359,14 @@ def matmul(a, b, activation=""):
364359
# We can test our custom matrix multiplication operation against a native torch implementation (i.e., cuBLAS).
365360

366361
torch.manual_seed(0)
367-
a = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
368-
b = torch.randn((512, 512), device=DEVICE, dtype=torch.float16)
362+
a = torch.rand((512, 512), device=DEVICE, dtype=torch.float16) - 0.5
363+
b = torch.rand((512, 512), device=DEVICE, dtype=torch.float16) - 0.5
369364
triton_output = matmul(a, b)
370365
torch_output = torch.matmul(a, b)
371366
print(f"triton_output_with_fp16_inputs={triton_output}")
372367
print(f"torch_output_with_fp16_inputs={torch_output}")
373-
# Bigger tolerance for AMD CDNA2 devices.
374-
# CDNA2 devices use reduced precision fp16 and bf16 and flush input and
375-
# output denormal values to zero. Detailed info is at: https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices
376-
rtol = 1e-2 if is_hip_cdna2() else 0
377-
if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=rtol):
368+
369+
if torch.allclose(triton_output, torch_output, atol=1e-2, rtol=0):
378370
print("✅ Triton and Torch match")
379371
else:
380372
print("❌ Triton and Torch differ")

0 commit comments

Comments
 (0)