Merge pull request #757 from ROCm/ravil/rocm-triton-perf

ravil-mobile · web-flow · commit 0e3e441ccbe7 · 2025-03-18T15:50:12.000+01:00
[AMD] Added a script that computes TFLOP/s using perf. counters
diff --git a/python/perf-kernels/tools/rocm-triton-prof/README.md b/python/perf-kernels/tools/rocm-triton-prof/README.md
@@ -0,0 +1,135 @@
+### Install Latest rocprofv3 from sources
+
+Compile the latest rocprofv3 from sources (use the `amd-staging` branch):
+
+```bash
+cd ~
+mkdir -p ~/usr/rocprofv3
+INSTALL_DIR=$(realpath ~/usr/rocprofv3)
+git clone https://github.com/rocm/rocprofiler-sdk
+cd rocprofiler-sdk
+mkdir -p build && cd build
+cmake .. -DCMAKE_INSTALL_PREFIX=${INSTALL_DIR} -DROCPROFILER_BUILD_TESTS=ON -DROCPROFILER_BUILD_SAMPLES=ON
+make -j
+make install
+```
+
+Set the corresponding environment variables:
+
+```bash
+$ cat ~/load.rocprofv3.sh
+#!/bin/bash
+
+INSTALL_DIR=$(realpath ~/usr/rocprofv3)
+
+export PATH=${INSTALL_DIR}/bin:${PATH}
+export LD_LIBRARY_PATH=${INSTALL_DIR}/lib:${LD_LIBRARY_PATH}
+export LIBRARY_PATH=${INSTALL_DIR}/lib:${LIBRARY_PATH}
+
+$ source ~/load.rocprofv3.sh
+```
+
+### Adjust Triton Source Code
+
+The `flash-attention.py` kernel comes with auto-tuning. In this example, we want to measure the performance of the best-performing FA configuration. To find it, first run the kernel with the auto-tuner enabled:
+
+
+```bash
+$ TRITON_PRINT_AUTOTUNING=1 python3 ./flash-attention.py -b 2 -hq 16 -hk 16 -sq 8192 -sk 8192 -d 128 -causal -layout thd
+Autotuning kernel attn_fwd with config BLOCK_M: 128, BLOCK_N: 128, waves_per_eu: 2, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, instruction_sched_variant: none, num_warps: 4, num_ctas: 1, num_stages: 1, maxnreg: None
+Autotuning kernel attn_fwd with config BLOCK_M: 128, BLOCK_N: 64, waves_per_eu: 2, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, instruction_sched_variant: none, num_warps: 4, num_ctas: 1, num_stages: 1, maxnreg: None
+Autotuning kernel attn_fwd with config BLOCK_M: 128, BLOCK_N: 64, waves_per_eu: 3, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, instruction_sched_variant: none, num_warps: 4, num_ctas: 1, num_stages: 1, maxnreg: None
+Autotuning kernel attn_fwd with config BLOCK_M: 128, BLOCK_N: 64, waves_per_eu: 1, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, instruction_sched_variant: none, num_warps: 4, num_ctas: 1, num_stages: 1, maxnreg: None
+Autotuning kernel attn_fwd with config BLOCK_M: 128, BLOCK_N: 32, waves_per_eu: 2, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, instruction_sched_variant: none, num_warps: 4, num_ctas: 1, num_stages: 1, maxnreg: None
+Triton autotuning for function attn_fwd finished after 15.06s; best config selected: BLOCK_M: 128, BLOCK_N: 64, waves_per_eu: 2, PRE_LOAD_V: False, GRID_CU_MULTIP: 2, instruction_sched_variant: none, num_warps: 4, num_ctas: 1, num_stages: 1, maxnreg: None;
+fused-attention-fwd-d128-layoutthd:
+   BATCH    HQ    HK  N_CTX_Q  N_CTX_K      triton      torch
+0    2.0  16.0  16.0   8192.0   8192.0  221.869662  17.140226
+```
+
+
+Open the script and find the function that sets the tuning parameters (i.e., `get_cdna_autotune_configs`). It returns a list of candidate configs for the tuner. Comment out everything except the winning config found in the previous step. For example,
+
+
+```python
+def get_cdna_autotune_configs():
+    return [
+        #triton.Config({'BLOCK_M': 128, 'BLOCK_N': 128, 'waves_per_eu': 2, 'PRE_LOAD_V': False, 'GRID_CU_MULTIP': 2},
+        #              num_stages=1, num_warps=4),
+        triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 2, 'PRE_LOAD_V': False, 'GRID_CU_MULTIP': 2},
+                      num_stages=1, num_warps=4),
+        #triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 3, 'PRE_LOAD_V': False, 'GRID_CU_MULTIP': 2},
+        #              num_stages=1, num_warps=4),
+        #triton.Config({'BLOCK_M': 128, 'BLOCK_N': 64, 'waves_per_eu': 1, 'PRE_LOAD_V': False, 'GRID_CU_MULTIP': 2},
+        #              num_stages=1, num_warps=4),
+        #triton.Config({'BLOCK_M': 128, 'BLOCK_N': 32, 'waves_per_eu': 2, 'PRE_LOAD_V': False, 'GRID_CU_MULTIP': 2},
+        #              num_stages=1, num_warps=4),
+    ], ['IS_CAUSAL', 'dropout_p', 'MAX_SEQLENS_Q', 'MAX_SEQLENS_K', 'ACTUAL_BLOCK_DMODEL', 'VARLEN', 'HQ', 'HK']
+
+```
+
+
+### Collect Performance Data
+
+Create a symbolic link to `rocm-triton-prof.py` in the directory where you run the test. For example,
+
+```bash
+ln -s <rocm-triton-dir>/python/perf-kernels/tools/rocm-triton-prof/rocm-triton-prof.py rocm-triton-prof.py
+```
+
+Run the tool as follows:
+
+```bash
+$ python3 ./rocm-triton-prof.py --kernel attn_fwd --cmd python3 ./flash-attention.py -b 2 -hq 16 -hk 16 -sq 8192 -sk 8192 -d 128 -causal -layout thd
+Timing info in `nsec`:
+count       269.000000
+mean     326119.100372
+std        7120.765559
+min      304946.000000
+25%      322147.000000
+50%      327960.000000
+75%      331047.000000
+max      352857.000000
+dtype: float64
+
+NON-FLOP related data:
+   Counter Name        Max        Min          Mean     Median
+0    GRBM_COUNT  8955952.0  4043501.0  4.284156e+06  4261916.0
+1   TCC_HIT_sum  5347185.0  4074880.0  4.112117e+06  4107955.0
+2  TCC_MISS_sum  5932281.0  3526537.0  3.572396e+06  3556786.5
+
+FLOP related data:
+                    Counter Name     Raw Data          FLOP  Relative FLOP, %
+0          SQ_INSTS_VALU_ADD_F16          0.0  0.000000e+00          0.000000
+1          SQ_INSTS_VALU_MUL_F16          0.0  0.000000e+00          0.000000
+2          SQ_INSTS_VALU_FMA_F16     192512.0  2.464154e+07          0.030844
+3        SQ_INSTS_VALU_TRANS_F16          0.0  0.000000e+00          0.000000
+4          SQ_INSTS_VALU_ADD_F32    4898176.0  3.134833e+08          0.392393
+5          SQ_INSTS_VALU_MUL_F32    2411456.0  1.543332e+08          0.193182
+6          SQ_INSTS_VALU_FMA_F32    2486720.0  3.183002e+08          0.398422
+7        SQ_INSTS_VALU_TRANS_F32    2489728.0  1.593426e+08          0.199452
+8          SQ_INSTS_VALU_ADD_F64          0.0  0.000000e+00          0.000000
+9          SQ_INSTS_VALU_MUL_F64          0.0  0.000000e+00          0.000000
+10         SQ_INSTS_VALU_FMA_F64          0.0  0.000000e+00          0.000000
+11       SQ_INSTS_VALU_TRANS_F64          0.0  0.000000e+00          0.000000
+12   SQ_INSTS_VALU_MFMA_MOPS_F16  154140672.0  7.892002e+10         98.785706
+13  SQ_INSTS_VALU_MFMA_MOPS_BF16          0.0  0.000000e+00          0.000000
+14   SQ_INSTS_VALU_MFMA_MOPS_F32          0.0  0.000000e+00          0.000000
+15   SQ_INSTS_VALU_MFMA_MOPS_F64          0.0  0.000000e+00          0.000000
+
+Performance info in TFLOP/s:
+count    269.000000
+mean     245.090089
+std        5.420713
+min      226.409352
+25%      241.325627
+50%      243.597161
+75%      247.992764
+max      261.981219
+dtype: float64
+```
+
+### Known limits
+
+The tool currently supports only FP64, FP32 and FP16 operations (BF16 is covered for MFMA instructions only).
+Note that it can be extended to support other data types.
diff --git a/python/perf-kernels/tools/rocm-triton-prof/rocm-triton-prof.py b/python/perf-kernels/tools/rocm-triton-prof/rocm-triton-prof.py
@@ -0,0 +1,256 @@
+#!/usr/bin/python3
+
+import argparse
+import os
+import pandas as pd
+import yaml
+import subprocess
+import shutil
+import re
+from collections import OrderedDict
+
+
+def get_perf_metrics():
+    """Build the table of hardware counters, grouped by collection pass.
+
+    Returns:
+        An OrderedDict keyed by zero-based pass index. Each value is an
+        OrderedDict mapping a counter name to a freshly created record:
+        FLOP counters carry 'value', 'factor' (the multiplier that
+        `process_files` applies to the raw counter value) and 'flop';
+        the counters of the last pass carry only 'value'.
+    """
+    # One inner tuple per pass: (counter name, factor) pairs in collection order.
+    flop_passes = (
+        (('SQ_INSTS_VALU_ADD_F16', 64), ('SQ_INSTS_VALU_MUL_F16', 64),
+         ('SQ_INSTS_VALU_FMA_F16', 128), ('SQ_INSTS_VALU_TRANS_F16', 64)),
+        (('SQ_INSTS_VALU_ADD_F32', 64), ('SQ_INSTS_VALU_MUL_F32', 64),
+         ('SQ_INSTS_VALU_FMA_F32', 128), ('SQ_INSTS_VALU_TRANS_F32', 64)),
+        (('SQ_INSTS_VALU_ADD_F64', 64), ('SQ_INSTS_VALU_MUL_F64', 64),
+         ('SQ_INSTS_VALU_FMA_F64', 128), ('SQ_INSTS_VALU_TRANS_F64', 64)),
+        (('SQ_INSTS_VALU_MFMA_MOPS_F16', 512), ('SQ_INSTS_VALU_MFMA_MOPS_BF16', 512),
+         ('SQ_INSTS_VALU_MFMA_MOPS_F32', 512), ('SQ_INSTS_VALU_MFMA_MOPS_F64', 512)),
+    )
+    # Counters that are reported as-is and never converted to FLOP.
+    plain_counters = ('GRBM_COUNT', 'TCC_HIT_sum', 'TCC_MISS_sum')
+
+    jobs = OrderedDict()
+    for pass_idx, counters in enumerate(flop_passes):
+        jobs[pass_idx] = OrderedDict(
+            (name, {'value': 0, 'factor': factor, 'flop': 0}) for name, factor in counters)
+    jobs[len(flop_passes)] = OrderedDict((name, {'value': 0}) for name in plain_counters)
+    return jobs
+
+
+def get_metrics_as_yaml():
+    """Render the counter groups of `get_perf_metrics` as a YAML string.
+
+    The document has a single top-level `jobs` list with one `pmc` entry per
+    pass, each listing the counter names of that pass in order. `main` writes
+    the string to a file that is handed to `rocprofv3` through `-i`.
+    """
+    jobs = [{'pmc': list(counters.keys())} for counters in get_perf_metrics().values()]
+    return yaml.dump({'jobs': jobs})
+
+
+def run_external_binary(binary_path, arguments=None, verbose=False):
+    """Run a command and return its stripped standard output.
+
+    Args:
+        binary_path: executable to run; when falsy, `arguments` already holds
+            the complete command line.
+        arguments: list of command-line arguments (defaults to no arguments).
+        verbose: print the command line before running it.
+
+    Returns:
+        The captured stdout with surrounding whitespace stripped. Text written
+        to stderr by a successful command is printed.
+
+    Raises:
+        RuntimeError: the executable is missing, could not be started, or
+            exited with a non-zero code (its stderr is appended to the message).
+    """
+    arguments = list(arguments) if arguments else []
+    cmd = [binary_path] + arguments if binary_path else arguments
+    try:
+        if verbose:
+            print(f"CURR.CMD: {' '.join(cmd)}")
+        result = subprocess.run(cmd, capture_output=True, text=True)
+    except FileNotFoundError as err:
+        # Report the executable that was actually looked up, even when the
+        # whole command was passed through `arguments`.
+        missing = cmd[0] if cmd else binary_path
+        raise RuntimeError(f'Error: The binary could not be found - i.e., {missing}') from err
+    except Exception as err:
+        raise RuntimeError(f'Error: {str(err)}') from err
+
+    # Checked outside of `try` so that this error is not re-wrapped by the
+    # generic handler above (which used to yield an "Error: Error: ..." message).
+    if result.returncode != 0:
+        details = result.stderr.strip()
+        raise RuntimeError(f'Error: The external binary returned non-zero exit code {result.returncode}. '
+                           f"Attempted command:\n{' '.join(map(str, cmd))}" + (f'\n{details}' if details else ''))
+
+    if result.stderr:
+        print(result.stderr.strip())
+    return result.stdout.strip()
+
+
+def check_rocprofv3():
+    """Make sure the `rocprofv3` executable can be found on `PATH`.
+
+    Raises:
+        RuntimeError: `rocprofv3` is not available.
+    """
+    # `shutil.which` avoids depending on an external `which` utility and lets
+    # the error name the actual problem.
+    if shutil.which('rocprofv3') is None:
+        raise RuntimeError('Error: `rocprofv3` was not found in PATH')
+
+
+def find_file(rootdir, regex):
+    """Return the path of the first file under `rootdir` whose name matches `regex`.
+
+    The tree is traversed with `os.walk` and `regex.match` is applied to the
+    bare file name. Returns `None` when nothing matches.
+    """
+    candidates = (os.path.join(folder, name)
+                  for folder, _, names in os.walk(rootdir)
+                  for name in names
+                  if regex.match(name))
+    return next(candidates, None)
+
+
+def filter(df, name):
+    """Return the rows of `df` whose `Kernel_Name` column equals `name`.
+
+    NOTE: the name shadows the built-in `filter`; it is kept because
+    `process_files` calls it under this name.
+    """
+    is_selected = df['Kernel_Name'] == name
+    return df.loc[is_selected]
+
+
+def _read_kernel_rows(search_dir, file_regex, kernel_name):
+    """Load the first CSV under `search_dir` whose name matches `file_regex`.
+
+    Only the rows whose `Kernel_Name` equals `kernel_name` are returned.
+    Raises FileNotFoundError when no matching file exists, instead of letting
+    `pd.read_csv` fail on a `None` path.
+    """
+    csv_path = find_file(search_dir, file_regex)
+    if csv_path is None:
+        raise FileNotFoundError(f'no file matching `{file_regex.pattern}` found under {search_dir}')
+    return filter(pd.read_csv(csv_path), kernel_name)
+
+
+def process_files(metrics_dir, timing_dir, kernel_name, verbose):
+    """Print timing, counter and TFLOP/s statistics for one kernel.
+
+    Args:
+        metrics_dir: directory holding one `pass_<N>` sub-directory (N starts
+            at 1) per counter group returned by `get_perf_metrics`, each with a
+            `*counter_collection.csv` file.
+        timing_dir: directory holding a `*kernel_trace.csv` file.
+        kernel_name: exact `Kernel_Name` value to select in the CSV files.
+        verbose: accepted for interface compatibility; currently unused.
+
+    Raises:
+        FileNotFoundError: an expected CSV file is missing.
+        RuntimeError: the kernel trace has no rows for `kernel_name`.
+    """
+    timing_df = _read_kernel_rows(timing_dir, re.compile(r'.*kernel_trace.csv'), kernel_name)
+    if timing_df.empty:
+        # Everything below would be NaN; fail with an actionable message instead.
+        raise RuntimeError(f'Error: kernel `{kernel_name}` was not found in the kernel trace')
+    timing = timing_df['End_Timestamp'] - timing_df['Start_Timestamp']
+    print('Timing info in `nsec`:')
+    print(timing.describe())
+    print()
+
+    # Read every pass exactly once; counters with a 'flop' field contribute to
+    # the FLOP total, the others are only summarized.
+    num_flop_sum = 0
+    perf_metrics = get_perf_metrics()
+    metrics_file_regex = re.compile(r'.*counter_collection.csv')
+    non_flop_table = {'Counter Name': [], 'Max': [], 'Min': [], 'Mean': [], 'Median': []}
+    for job_id, curr_metrics in perf_metrics.items():
+        # pass directories are numbered from 1, the metric groups from 0
+        search_dir = os.path.join(metrics_dir, f'pass_{job_id + 1}')
+        df = _read_kernel_rows(search_dir, metrics_file_regex, kernel_name)
+
+        for name, record in curr_metrics.items():
+            values = df.loc[df['Counter_Name'] == name, 'Counter_Value']
+            record['value'] = values.mean()
+            if 'flop' in record:
+                record['flop'] = record['value'] * record['factor']
+                num_flop_sum += record['flop']
+            else:
+                non_flop_table['Counter Name'].append(name)
+                non_flop_table['Max'].append(values.max())
+                non_flop_table['Min'].append(values.min())
+                non_flop_table['Mean'].append(record['value'])
+                non_flop_table['Median'].append(values.median())
+    print()
+
+    # Print data from non-flop-passes
+    print('NON-FLOP related data:')
+    print(pd.DataFrame(non_flop_table))
+    print()
+
+    # Print data from flop-passes
+    print('FLOP related data:')
+    flop_table = {'Counter Name': [], 'Raw Data': [], 'FLOP': [], 'Relative FLOP, %': []}
+    for curr_metrics in perf_metrics.values():
+        for name, record in curr_metrics.items():
+            if 'flop' not in record:
+                continue
+            num_flops = record['flop']
+            # a kernel without any counted FLOP gets 0 % everywhere instead of 0/0
+            relative_value = 100 * num_flops / num_flop_sum if num_flop_sum else 0.0
+            flop_table['Counter Name'].append(name)
+            flop_table['Raw Data'].append(record['value'])
+            flop_table['FLOP'].append(num_flops)
+            flop_table['Relative FLOP, %'].append(relative_value)
+    print(pd.DataFrame(flop_table))
+    print()
+
+    print('Performance info in TFLOP/s:')
+    # durations are reported in nsec above: FLOP / nsec is GFLOP/s, and the
+    # extra factor of 1000 converts that to TFLOP/s
+    performance = num_flop_sum / (timing * 1000)
+    print(performance.describe())
+    print()
+
+
+def main(args):
+    """Collect profiling data with `rocprofv3` and print the summary.
+
+    `args` is the parsed command line; the attributes read here are `cmd`,
+    `kernel`, `verbose` and `display_only`. The counter spec
+    (`metrics_spec.yaml`) and the output directories (`metrics_dir`,
+    `timing_dir`) live in the directory of `__file__`. Unless
+    `args.display_only` is set, the user command is run three times: as is,
+    under `rocprofv3` for counters, and under `rocprofv3` for kernel timing.
+    """
+    check_rocprofv3()
+
+    here = os.path.dirname(os.path.abspath(__file__))
+    spec_path = os.path.join(here, "metrics_spec.yaml")
+    spec_text = get_metrics_as_yaml()
+    with open(spec_path, 'w') as spec_file:
+        spec_file.write(spec_text)
+
+    metrics_dir = os.path.join(here, "metrics_dir")
+    timing_dir = os.path.join(here, "timing_dir")
+
+    def run_step(banner, command, stale_dir=None):
+        # announce, drop results of a previous run, execute, echo the output
+        if args.verbose:
+            print(banner)
+        if stale_dir is not None and os.path.exists(stale_dir):
+            shutil.rmtree(stale_dir)
+        captured = run_external_binary([], command, args.verbose)
+        if args.verbose:
+            print(captured)
+
+    if not args.display_only:
+        user_cmd = args.cmd
+
+        # test original command
+        run_step('running original program...', user_cmd)
+
+        # collect performance metrices
+        counters_cmd = ['rocprofv3', '-i', spec_path, '-d', metrics_dir, '--']
+        run_step('running rocprofv3 passes...', counters_cmd + user_cmd, metrics_dir)
+
+        # collect timing
+        trace_cmd = ['rocprofv3', '--kernel-trace', '-d', timing_dir, '--']
+        run_step('running rocprofv3 for timing info...', trace_cmd + user_cmd, timing_dir)
+
+    process_files(metrics_dir, timing_dir, args.kernel, args.verbose)
+
+
+if __name__ == "__main__":
+    # `--cmd` is declared with argparse.REMAINDER, which gathers all remaining
+    # command-line arguments; other options of this tool therefore have to be
+    # given before `--cmd`.
+    cli = argparse.ArgumentParser()
+    cli.add_argument("-k", "--kernel", type=str, required=True, help="name of a kernel")
+    cli.add_argument('-c', '--cmd', required=True, nargs=argparse.REMAINDER, help='user command')
+    cli.add_argument("--display-only", action='store_true', help='display info without running')
+    cli.add_argument("-v", "--verbose", action='store_true', help='verbose output')
+
+    main(cli.parse_args())