Update the timing measurements in GEMM and HBM microbenchmarks (#59)

chishuen · web-flow · commit 4280427e6035 · 2025-04-11T11:34:35.000+08:00
This update includes the following changes:

* Remove the `lambda` wrapper around the benchmark function, because the wrapper has some impact on the measured performance.

* Add a `--clear_caches` flag to control whether the compilation and staging caches are cleared before every run.

* Time the benchmark function with `time()` when no trace matcher is given, so that users can decide how the runtime should be measured.

* Update README to explain the differences between the timing measurements.
diff --git a/microbenchmarks/README.md b/microbenchmarks/README.md
@@ -2,7 +2,7 @@
 
 ## Setup
 
-Set up a v6e TPU VM:
+Set up a v6e TPU VM for single-chip microbenchmarks:
 ```
 gcloud compute tpus tpu-vm create ${TPU_NAME} /
         --project ${PROJECT_ID} /
@@ -31,26 +31,52 @@ pip install -r requirements.txt
 Usage example:
 ```
 python benchmark_matmul.py \
-  --dim 4096 4096 4096 \
+  --dim 8192 8192 8192 \
   --libtpu_args=--xla_tpu_scoped_vmem_limit_kib=65536 \
-  --matcher="jit_matmul.*"
+  --trace_matcher="jit_matmul.*"
 ```
 
 Example output:
 ```
-dtype: bfloat16, matrix Dimensions: (4096, 4096, 4096), time taken (median): 0.16358503900000002 ms, TFLOPS: 840.1682348958574
+dtype: bfloat16, matrix dimensions: (8192, 8192, 8192), time taken (median, ms): 1.328756094, TFLOPS: 827.474382048629
 ```
 
-Run `python benchmark_matmul.py -h` to view the how to set the arguments.
+The figure below shows the trace of the example above. Setting
+ `--trace_matcher="jit_matmul.*"` means that the completion time is measured by
+ the duration of the compiled [`matmul`](benchmark_matmul.py#L19) function on
+ TPUs, which excludes the communication overheads between the host (CPU) and
+ TPUs.
 
 
+![Trace Image](https://services.google.com/fh/files/misc/trace.png)
+
+
+If `--trace_matcher` is not set, the completion time will be measured by timing
+ the function on the host, which includes the compilation and communication
+ overheads, including kernel launch, data transfer, synchronization, etc.
+
+Example:
+```
+python benchmark_matmul.py \
+  --dim 8192 8192 8192 \
+  --libtpu_args=--xla_tpu_scoped_vmem_limit_kib=65536
+```
+
+Output:
+
+```
+dtype: bfloat16, matrix dimensions: (8192, 8192, 8192), time taken (median, ms): 1.457810401916504, TFLOPS: 754.2212803054033
+```
+
+Run `python benchmark_matmul.py -h` to view how to set the other arguments.
+
 ## HBM Bandwidth Benchmark
 
 Usage example:
 ```
 python benchmark_hbm.py \
   --num_elements=16777216 \
-  --matcher="jit_my_copy.*"
+  --trace_matcher="jit_my_copy.*"
 ```
 
 Example output:
diff --git a/microbenchmarks/benchmark_hbm.py b/microbenchmarks/benchmark_hbm.py
@@ -3,7 +3,7 @@
 Sample usage (on TPU vm):
   $ python benchmark_hbm.py \
   --num_elements=16777216 \
-  --matcher="jit_my_copy.*"
+  --trace_matcher="jit_my_copy.*"
 """
 
 import argparse
@@ -20,28 +20,30 @@ def my_copy(a):
 
 
 def get_dtype(dtype: str):
+  if dtype == "float32":
+    return jnp.float32
   if dtype == "bf16":
     return jnp.bfloat16
   if dtype == "fp8_e5m2":
     return jnp.float8_e5m2
   if dtype == "fp8_e4m3":
     return jnp.float8_e4m3fn
+  if dtype == "int8":
+    return jnp.int8
   raise ValueError(f"Invalid data type: {dtype}")
 
 
 def main():
   """Benchmark for HBM bandwidth."""
 
-  parser = argparse.ArgumentParser(
-      description="Run HBM bandwidth benchmark."
-  )
+  parser = argparse.ArgumentParser(description="Run HBM bandwidth benchmark.")
 
   parser.add_argument(
       "--dtype",
       type=str,
-      choices=["bf16", "fp8_e5m2", "fp8_e4m3"],
+      choices=["float32", "bf16", "fp8_e5m2", "fp8_e4m3", "int8"],
       default="bf16",
-      help="Data type of the matrix elements.",
+      help="Data type of the tensor elements.",
   )
   parser.add_argument(
       "--libtpu_args",
@@ -56,21 +58,21 @@ def main():
       "--num_elements",
       type=int,
       required=True,
-      help="Number of elements in the array.",
+      help="Number of elements in the tensor.",
   )
   parser.add_argument(
       "--num_iter",
       type=int,
-      default=100,
-      help="Number of times the matmul kernel will be run.",
+      default=200,
+      help="Number of times the benchmark function will be run.",
   )
   parser.add_argument(
       "--warmup_iter",
       type=int,
-      default="1",
+      default=30,
       help=(
-          "Number of times the matmul kernel will be run to warm up before the"
-          " acutal timing measurement starts."
+          "Number of times the benchmark function will be run to warm up before"
+          " the actual timing measurement starts."
       ),
   )
   parser.add_argument(
@@ -89,15 +91,23 @@ def main():
       ),
   )
   parser.add_argument(
-      "--matcher",
+      "--trace_matcher",
       type=str,
       required=False,
       help=(
           "A regex-based string matcher to filter the trace events eligible for"
-          " benchmarking. This arg would be useful if we want to measure the"
-          " timing of a specific op or XLA module within the function., e.g."
-          " --matcher='fusion' measures the timing of XLA fusion op"
-          " specifically."
+          " benchmarking. If a matcher is specified, the timing result will be"
+          " derived from the profiler trace. Otherwise, the result will be"
+          " derived from the time() wrapper."
+      ),
+  )
+  parser.add_argument(
+      "--clear_caches",
+      action=argparse.BooleanOptionalAction,
+      help=(
+          "If set, jax.clear_caches() will be invoked every time before the"
+          " benchmark function is executed, which clears all compilation and"
+          " staging caches."
       ),
   )
 
@@ -111,14 +121,16 @@ def main():
   a = jax.random.normal(jax.random.key(0), (n,)).astype(dtype)
   compiled = jax.jit(my_copy).lower(a).compile()
 
-  matcher = re.compile(args.matcher) if args.matcher else None
+  matcher = re.compile(args.trace_matcher) if args.trace_matcher else None
   result = run_bench(
-      lambda: jax.block_until_ready(compiled(a)),
+      compiled,
+      a,
       num_iter=args.num_iter,
       warmup_iter=args.warmup_iter,
       log_dir=args.log_dir,
       func_label=args.label,
-      event_matcher=matcher,
+      trace_matcher=matcher,
+      clear_caches=args.clear_caches,
   )
 
   tensor_size = n * a.itemsize
diff --git a/microbenchmarks/benchmark_matmul.py b/microbenchmarks/benchmark_matmul.py
@@ -2,9 +2,9 @@
 
 Sample usage (on TPU vm):
   $ python benchmark_matmul.py \
-  --dim 4096 4096 4096 \
+  --dim 8192 8192 8192 \
   --libtpu_args=--xla_tpu_scoped_vmem_limit_kib=65536 \
-  --matcher="jit_matmul.*"
+  --trace_matcher="jit_matmul.*"
 """
 
 import argparse
@@ -21,12 +21,16 @@ def matmul(a, b):
 
 
 def get_dtype(dtype: str):
+  if dtype == "float32":
+    return jnp.float32
   if dtype == "bf16":
     return jnp.bfloat16
   if dtype == "fp8_e5m2":
     return jnp.float8_e5m2
   if dtype == "fp8_e4m3":
     return jnp.float8_e4m3fn
+  if dtype == "int8":
+    return jnp.int8
   raise ValueError(f"Invalid data type: {dtype}")
 
 
@@ -39,7 +43,7 @@ def main():
   parser.add_argument(
       "--dtype",
       type=str,
-      choices=["bf16", "fp8_e5m2", "fp8_e4m3"],
+      choices=["float32", "bf16", "fp8_e5m2", "fp8_e4m3", "int8"],
       default="bf16",
       help="Data type of the matrix elements.",
   )
@@ -65,16 +69,16 @@ def main():
   parser.add_argument(
       "--num_iter",
       type=int,
-      default=100,
-      help="Number of times the matmul kernel will be run.",
+      default=200,
+      help="Number of times the benchmark function will be run.",
   )
   parser.add_argument(
       "--warmup_iter",
       type=int,
-      default="1",
+      default=30,
       help=(
-          "Number of times the matmul kernel will be run to warm up before the"
-          " actual timing measurement starts."
+          "Number of times the benchmark function will be run to warm up before"
+          " the actual timing measurement starts."
       ),
   )
   parser.add_argument(
@@ -93,15 +97,23 @@ def main():
       ),
   )
   parser.add_argument(
-      "--matcher",
+      "--trace_matcher",
       type=str,
       required=False,
       help=(
           "A regex-based string matcher to filter the trace events eligible for"
-          " benchmarking. This arg would be useful if we want to measure the"
-          " timing of a specific op or XLA module within the function., e.g."
-          " --matcher='fusion' measures the timing of XLA fusion op"
-          " specifically."
+          " benchmarking. If a matcher is specified, the timing result will be"
+          " derived from the profiler trace. Otherwise, the result will be"
+          " derived from the time() wrapper."
+      ),
+  )
+  parser.add_argument(
+      "--clear_caches",
+      action=argparse.BooleanOptionalAction,
+      help=(
+          "If set, jax.clear_caches() will be invoked every time before the"
+          " benchmark function is executed, which clears all compilation and"
+          " staging caches."
       ),
   )
 
@@ -116,25 +128,28 @@ def main():
   b = jax.random.normal(jax.random.key(0), (n, k)).astype(dtype)
 
   compiled = jax.jit(matmul).lower(a, b).compile()
-  matcher = re.compile(args.matcher) if args.matcher else None
+  matcher = re.compile(args.trace_matcher) if args.trace_matcher else None
   result = run_bench(
-      lambda: jax.block_until_ready(compiled(a, b)),
+      compiled,
+      a,
+      b,
       num_iter=args.num_iter,
       warmup_iter=args.warmup_iter,
       log_dir=args.log_dir,
       func_label=args.label,
-      event_matcher=matcher,
+      trace_matcher=matcher,
+      clear_caches=args.clear_caches,
   )
 
   # 2 ops (multiply and add)
   compute = m * n * k * 2
   tflops = compute / result.time_median / 1e12
 
   print(
-      f"dtype: {dtype.__name__}, matrix Dimensions: ({m}, {n}, {k}), time taken"
-      f" (median): {result.time_median * 1e3} ms, TFLOPS: {tflops}"
+      f"dtype: {dtype.__name__}, matrix dimensions: ({m}, {n}, {k}), time taken"
+      f" (median, ms): {result.time_median * 1e3}, TFLOPS: {tflops}"
   )
 
 
 if __name__ == "__main__":
-  main()
+  main()
diff --git a/microbenchmarks/benchmark_utils.py b/microbenchmarks/benchmark_utils.py