@@ -1,10 +1,22 @@
 """
-Persistent FP8 Matmul
+Persistent Matmul
 =====================
 This script demonstrates persistent kernel implementations of matrix multiplication using Triton.
-It includes various matmul methods, such as naive, persistent, and TMA (Tensor Memory Accelerator) based approaches, and only supports GPUs with compute capability >= 9.0.
-Triton and CuBLAS implementations are benchmarked under different configurations and evaluated using the proton profiler.
+Various matmul methods are included, such as naive, persistent, and TMA (Tensor Memory Accelerator) based approaches.
+The kernels support both FP16 and FP8 data types, but the FP8 implementation is only available on CUDA devices with compute capability >= 9.0.
+
+Triton and cuBLAS implementations are benchmarked under different configurations and evaluated using the proton profiler.
 Users can pass command-line arguments to specify matrix dimensions and iteration steps flexibly.
+
+.. code-block:: bash
+
+    # FP8
+    python 09-persistent-matmul.py --prec fp8 --K_range 128 1024 --K_step 128
+
+    # FP16
+    python 09-persistent-matmul.py --prec fp16 --K_range 128 1024 --K_step 128
+
+Note that this tutorial currently fails on devices with small shared memory, such as the RTX 4090.
 """
 
 import argparse
@@ -36,12 +48,12 @@ def _matmul_launch_metadata(grid, kernel, args):
     ret = {}
     M, N, K = args["M"], args["N"], args["K"]
     ret["name"] = f"{kernel.name} [M={M}, N={N}, K={K}]"
-    ret["flops8"] = 2. * M * N * K
     if "c_ptr" in args:
         bytes_per_elem = args["c_ptr"].element_size()
     else:
         bytes_per_elem = 1 if args["FP8_OUTPUT"] else 2
-    ret["bytes"] = bytes_per_elem * (M * K + N * K)
+    ret[f"flops{bytes_per_elem * 8}"] = 2. * M * N * K
+    ret["bytes"] = bytes_per_elem * (M * K + N * K + M * N)
     return ret
 
 
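As a worked example of the updated accounting (illustrative numbers, not part of the patch): with an FP8 output, element_size() is 1, so the key becomes "flops8" (FP16 gives "flops16"), and the byte count now also charges the M * N output write.

.. code-block:: python

    # Sketch of the launch-metadata arithmetic for hypothetical FP8 shapes.
    M, N, K = 8192, 8192, 512
    bytes_per_elem = 1                                      # FP8 output; FP16 would be 2
    flops_key = f"flops{bytes_per_elem * 8}"                # -> "flops8"
    flops = 2. * M * N * K                                  # one multiply + one add per C element
    bytes_moved = bytes_per_elem * (M * K + N * K + M * N)  # read A, read B, write C
    print(flops_key, flops, bytes_moved)                    # flops8 68719476736.0 75497472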
@@ -328,7 +340,7 @@ def matmul_tma_persistent(a, b):
     N, K = b.shape
     dtype = a.dtype
 
-    c = torch.zeros((M, N), device=a.device, dtype=dtype)
+    c = torch.empty((M, N), device=a.device, dtype=dtype)
     desc_a = triton.tools.experimental_descriptor.create_2d_tma_descriptor(a.data_ptr(), M, K,
                                                                            configs[dtype]["BLOCK_SIZE_M"],
                                                                            configs[dtype]["BLOCK_SIZE_K"],
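Replacing torch.zeros with torch.empty is safe here because the kernel writes every element of c; torch.zeros would additionally launch a device-side fill kernel whose work is immediately overwritten. A minimal sketch of the difference:

.. code-block:: python

    import torch

    # torch.zeros allocates and launches a fill kernel; torch.empty only allocates.
    # Since the matmul overwrites all of c, the zero-fill is wasted work.
    c_zeros = torch.zeros((8192, 8192), device="cuda", dtype=torch.float16)
    c_empty = torch.empty((8192, 8192), device="cuda", dtype=torch.float16)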
@@ -481,7 +493,7 @@ def matmul_device_tma_persistent(a, b, tiles_per_update):
     N, K = b.shape
     dtype = a.dtype
 
-    c = torch.zeros((M, N), device=a.device, dtype=dtype)
+    c = torch.empty((M, N), device=a.device, dtype=dtype)
     NUM_SMS = torch.cuda.get_device_properties("cuda").multi_processor_count
     tma_size = 128
     workspace = torch.empty(NUM_SMS * 3 * tma_size, dtype=torch.uint8, device="cuda")
@@ -511,21 +523,20 @@ def cublas_matmul(a, b):
     dtype = a.dtype
     c = torch.empty((M, N), device=a.device, dtype=dtype)
     bytes_per_elem = a.element_size()
-    flops_str = "flops8" if dtype == torch.float8_e4m3fn else "flops"
-    with proton.scope(f"cublas M={M}, N={N}, K={K}",
-                      {"bytes": bytes_per_elem * (M * K + N * K), flops_str: 2. * M * N * K}):
+    flops_str = f"flops{bytes_per_elem * 8}"
+    with proton.scope(f"cublas [M={M}, N={N}, K={K}]",
+                      {"bytes": bytes_per_elem * (M * K + N * K + M * N), flops_str: 2. * M * N * K}):
         cublas.matmul(a, b, c)
     return c
 
 
 def torch_matmul(a, b):
     M, K = a.shape
     N, K = b.shape
-    dtype = a.dtype
     bytes_per_elem = a.element_size()
-    flops_str = "flops8" if dtype == torch.float8_e4m3fn else "flops"
-    with proton.scope(f"torch M={M}, N={N}, K={K}",
-                      {"bytes": bytes_per_elem * (M * K + N * K), flops_str: 2. * M * N * K}):
+    flops_str = f"flops{bytes_per_elem * 8}"
+    with proton.scope(f"torch [M={M}, N={N}, K={K}]",
+                      {"bytes": bytes_per_elem * (M * K + N * K + M * N), flops_str: 2. * M * N * K}):
         c = torch.matmul(a, b.T)
     return c
 
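The scope pattern used by these wrappers can be reproduced standalone; the sketch below assumes the tutorial's import triton.profiler as proton and uses a plain torch matmul as the payload, with the same metric naming as the patch.

.. code-block:: python

    import torch
    import triton.profiler as proton  # assumed, matching the tutorial's imports

    def scoped_matmul(a, b):
        M, K = a.shape
        N, K = b.shape
        bytes_per_elem = a.element_size()
        flops_str = f"flops{bytes_per_elem * 8}"  # "flops16" for FP16, "flops8" for FP8
        # Attach byte and flop counts to this region so the profiler can derive throughput.
        with proton.scope(f"example [M={M}, N={N}, K={K}]",
                          {"bytes": bytes_per_elem * (M * K + N * K + M * N), flops_str: 2. * M * N * K}):
            return torch.matmul(a, b.T)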
@@ -558,10 +569,8 @@ def bench(K, dtype, tiles_per_update, reps=10):
         for _ in range(reps):
             matmul_tma_persistent(a, b)
         time.sleep(0.01)
-    flops_str = "flops8" if dtype == torch.float8_e4m3fn else "flops"
     with proton.scope(
-            f"matmul_kernel_device_tma_persistent M={M}, N={N}, K={K}, tiles_per_update={tiles_per_update:02}",
-            {"bytes": a.element_size() * (M * K + N * K), flops_str: 2. * M * N * K}):
+            f"matmul_kernel_device_tma_persistent [M={M}, N={N}, K={K}, tiles_per_update={tiles_per_update:02}]"):
         for _ in range(reps):
             matmul_device_tma_persistent(a, b, tiles_per_update)
         time.sleep(0.01)
@@ -608,6 +617,17 @@ def validate(M, N, K, dtype, tiles_per_update):
     print()
 
 
+def show_profile(precision, profile_name):
+    import triton.profiler.viewer as proton_viewer
+    metrics = ["time/ms"]
+    if precision == 'fp8':
+        metrics = ["tflop8/s"] + metrics
+    elif precision == 'fp16':
+        metrics = ["tflop16/s"] + metrics
+    file_name = f"{profile_name}.hatchet"
+    proton_viewer.parse(metrics, file_name, depth=100)
+
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
     parser.add_argument("-K", type=int, required=False, default=512)
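The viewer metrics tflop8/s and tflop16/s are derived from the recorded flops8/flops16 counters and the measured time; roughly (a sketch under that assumption, with a hypothetical timing):

.. code-block:: python

    # Assumed convention: tflop8/s = flops8 / time, scaled to teraflops.
    flops8 = 2. * 8192 * 8192 * 512   # as recorded by the launch metadata
    time_ms = 5.0                     # hypothetical measured time for the region
    tflops = flops8 / (time_ms * 1e-3) / 1e12
    print(f"{tflops:.2f} tflop8/s")   # -> 13.74 tflop8/s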
@@ -642,3 +662,4 @@ def validate(M, N, K, dtype, tiles_per_update):
     for K in range(args.K_range[0], args.K_range[1] + 1, args.K_step):
         bench(K, dtype, args.tiles_per_update)
     proton.finalize()
+    show_profile(args.prec, "matmul")