[AMD] Implement tl.extra.hip.memrealtime for timing (#7282)

ShawnZhong · dshi7 · commit 0cb53c1698be · 2025-08-07T22:42:08.000-07:00
Similar to `tl.extra.cuda.globaltimer`, this PR exposes `tl.extra.hip.memrealtime` for AMD GPUs. This is useful for collecting timing information inside AMD kernels.

Reference: https://www.amd.com/content/dam/amd/en/documents/radeon-tech-docs/instruction-set-architectures/rdna2-shader-instruction-set-architecture.pdf

> 7.2.3. S_MEMREALTIME
>
> This instruction reads a 64-bit "real time-counter" and returns the value into a pair of SGPRS: SDST and SDST+1. The time value is from a clock for which the frequency is constant (not affected by power modes or core clock frequency changes).
>
> Because the instructions can return out-of-order, the only sensible way to use this counter is to implement S_WAITCNT 0; this imposes a wait for all data to return from previous SMEMs before continuing.
diff --git a/python/test/unit/language/test_core.py b/python/test/unit/language/test_core.py
@@ -5962,24 +5962,29 @@ def kernel(Out):
 
 
 def test_globaltimer(device):
-    if is_hip():
-        pytest.skip("test_globaltimer is not supported in HIP")
     check_cuda_or_hip(device)
 
     @triton.jit
-    def kernel(Out1, Out2):
-        start = tl.extra.cuda.globaltimer()
+    def kernel(Out1, Out2, func: tl.constexpr):  # func: timer intrinsic to call, passed as a constexpr
+        start = func()  # timestamp taken before the load/store loop
         off = tl.arange(0, 128)
         for i in range(10000):
             tl.store(Out1 + off, tl.load(Out1 + off) + 1)
-        end = tl.extra.cuda.globaltimer()
+        end = func()  # timestamp taken after the loop
         tl.store(Out2, end - start)
 
     out1 = to_triton(np.zeros((128, ), dtype=np.int64), device=device)
     out2 = to_triton(np.zeros((1, ), dtype=np.int64), device=device)
-    h = kernel[(1, )](out1, out2)
+    if is_cuda():  # pick the timer that matches the active backend
+        func = tl.extra.cuda.globaltimer
+    else:  # NOTE(review): assumes check_cuda_or_hip above leaves only HIP here - confirm
+        func = tl.extra.hip.memrealtime
+    h = kernel[(1, )](out1, out2, func)
     assert out2[0] > 0
-    assert h.asm["ptx"].count("%globaltimer") == 2
+    if is_cuda():
+        assert h.asm["ptx"].count("%globaltimer") == 2  # one read per func() call (start, end)
+    else:
+        assert h.asm["amdgcn"].count("s_memrealtime") == 2  # one read per func() call (start, end)
 
 def test_smid(device):
diff --git a/third_party/amd/language/hip/__init__.py b/third_party/amd/language/hip/__init__.py
@@ -1,3 +1,5 @@
 from . import libdevice
 
-__all__ = ["libdevice"]
+from .utils import memrealtime
+
+__all__ = ["libdevice", "memrealtime"]
diff --git a/third_party/amd/language/hip/utils.py b/third_party/amd/language/hip/utils.py
@@ -0,0 +1,20 @@
+from triton.language import core
+
+
+@core.extern
+def memrealtime(_semantic=None):
+    """
+    Returns a 64-bit real time-counter value, read with ``s_memrealtime``.
+    The asm is marked impure (``is_pure=False``); it takes no inputs.
+    """
+    # s_memrealtime is a scalar-memory (SMEM) read whose completion is tracked
+    # by lgkmcnt rather than vmcnt; wait on lgkmcnt so the result has landed
+    # before it is consumed.
+    return core.inline_asm_elementwise(
+        """
+        s_memrealtime $0
+        s_waitcnt lgkmcnt(0)
+        """,
+        "=r", [], dtype=core.int64,
+        is_pure=False, pack=1, _semantic=_semantic,
+    )