#!/usr/bin/env python3
"""Sweeps the parameters of the rocmlir driver for bugs in attention-based kernel configurations.

Usage:
    python3 attentionSweeps.py --mlir-build-dir <path-to-mlir-build-dir> [options]

Options:
    --mlir-build-dir  Path to the MLIR build directory (default: auto-detected)
    --samples         Number of random configuration samples to test (default: 1000)
    --jobs            Number of concurrent tests to run in parallel (default: os.cpu_count())
    --debug           Enable debug output
    --quiet           Disable per-test result output
    --log-failures    Save failing configurations to a CSV file
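
Example (illustrative flag values):
    python3 attentionSweeps.py --samples 200 --jobs 8 --log-failures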
| 14 | +""" |
import argparse
import asyncio
import csv
import itertools
import os
import random
import sys
from datetime import datetime, timezone
from typing import Iterable, List, TypeVar

from perfRunner import AttentionConfiguration
from perfRunner import getArch, getNumCU, initializeDataTypesAttention
from perfRunner import create_paths as createPaths
from perfRunner import find_mlir_build_dir as findMlirBuildDir
from perfRunner import GFX_CHIP_RE
from parameterSweeps import Options, sweepParameters, multilineRepr

# GLOBAL VARIABLES
DATA_TYPES_ATTENTION = initializeDataTypesAttention()
BOOLS = [True, False]
LOGFILE = 'failing_configs.csv'

# The ISO week number is used as the seed so that each weekly CI run is reproducible.
seed = datetime.now(timezone.utc).isocalendar()[1]
random.seed(seed)

def toAttentionConfig(params, options: Options) -> AttentionConfiguration:
    """Converts a sampled parameter tuple into an AttentionConfiguration instance."""
    shape, perf = params
    *shapeParams, currentSeqLen = shape
    dtype, g, slq, slk, nhq, nhkv, hdqk, hdv, scale, bias, tq, tk, tv, to, causal, rlse = shapeParams
    perfString = f"attn:v1:{','.join(str(x) for x in perf)}"
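    # e.g. the MFMA tuple (64, 64, 128, 16, 64, 16, 4, 1) yields "attn:v1:64,64,128,16,64,16,4,1"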
    attnConfig = AttentionConfiguration(
        dtype=dtype,
        g=g,
        seq_len_q=slq,
        seq_len_k=slk,
        num_heads_q=nhq,
        num_heads_kv=nhkv,
        head_dim_qk=hdqk,
        head_dim_v=hdv,
        with_attn_scale=scale,
        with_attn_bias=bias,
        transQ=tq,
        transK=tk,
        transV=tv,
        transO=to,
        causal=causal,
        return_lse=rlse,
        arch=options.arch,
        numCU=options.numCu,
        perf_config=perfString
    )
    attnConfig.currentSeqLen = currentSeqLen
    return attnConfig

IterType = TypeVar('IterType')
def grouper(iterable: Iterable[IterType], n: int):
    """Yields successive chunks of up to n items from iterable.

    e.g. list(grouper('ABCDE', 2)) == [('A', 'B'), ('C', 'D'), ('E',)]
    """
    it = iter(iterable)
    while True:
        chunk = tuple(itertools.islice(it, n))
        if not chunk:
            return
        yield chunk

def genCurrentSeqLens(g: int, maxSeqLen: int) -> list[int]:
    """Generates a random current sequence length in [0, maxSeqLen) for each of the g groups."""
    return [random.randint(0, maxSeqLen - 1) for _ in range(g)]

def sampleAttentionShape():
    g = random.randint(1, 256)          # GROUPS
    seqLenK = random.randint(1, 16384)  # SEQ_LEN_K

    useKVCache = random.choice(BOOLS)
    currentSeqLen = genCurrentSeqLens(g, seqLenK) if useKVCache else None
    seqLenQ = 1 if useKVCache else random.randint(1, 16384)  # SEQ_LEN_Q

    # By default numHeadsQ and numHeadsKV are both 1; when they are equal,
    # GQA is disabled. Both values are typically powers of 2, and numHeadsQ
    # must be divisible by numHeadsKV. Here we decide randomly whether to
    # use values different from the defaults.
    #
    # Requirements:
    #   - numHeadsQ > numHeadsKV (equal values leave GQA disabled)
    #   - numHeadsQ % numHeadsKV == 0
    numHeadsQ = 1
    numHeadsKV = 1
    genNumHeads = random.choice(BOOLS)
    if genNumHeads:
        # Rejection-sample powers of two until the GQA constraints hold.
        while True:
            numHeadsQ = 2**random.randint(1, 6)
            numHeadsKV = 2**random.randint(1, 6)

            if numHeadsQ > numHeadsKV and numHeadsQ % numHeadsKV == 0:  # found a valid pair
                break

    return (
        random.choice(DATA_TYPES_ATTENTION),
        g,                        # GROUPS
        seqLenQ,                  # SEQ_LEN_Q
        seqLenK,                  # SEQ_LEN_K
        numHeadsQ,                # NUM_HEADS_Q
        numHeadsKV,               # NUM_HEADS_KV
        random.randint(1, 1024),  # HEAD_DIM_QK
        random.randint(1, 1024),  # HEAD_DIM_V
        random.choice(BOOLS),     # with_attn_scale
        random.choice(BOOLS),     # with_attn_bias
        random.choice(BOOLS),     # transQ
        random.choice(BOOLS),     # transK
        random.choice(BOOLS),     # transV
        random.choice(BOOLS),     # transO
        random.choice(BOOLS),     # causal
        random.choice(BOOLS),     # return_lse
        currentSeqLen
    )

# Keep in sync with RockTuningImpl.cpp
perfConfigSpaceMFMA = list(itertools.product(  # MFMA perfConfig space
    [32, 64, 128, 256],  # M/block G0
    [32, 64, 128, 256],  # M/block G1
    [32, 64, 128, 256],  # N/block G0
    [8, 16, 32, 64],     # Kpack/Block
    [32, 64, 128, 256],  # M/Wave
    [4, 16, 32],         # MN/Xdl
    [4, 8, 16],          # kPack
    [0, 1]               # forceUnroll
))
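# Assuming independent choices, the MFMA space has 4^5 * 3 * 3 * 2 = 18432 candidate configs.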

perfConfigSpaceWMMA = list(itertools.product(  # WMMA perfConfig space
    [32, 64, 128],       # M/block G0
    [32, 64, 128],       # M/block G1
    [32, 64, 128, 256],  # N/block G0
    [8, 16, 32, 64],     # Kpack/Block
    [32, 64],            # M/Wave
    [32, 64],            # N/Wave
    [4, 8, 16],          # kPack
    [0, 1]               # forceUnroll
))
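# Similarly, the WMMA space has 3 * 3 * 4 * 4 * 2 * 2 * 3 * 2 = 3456 candidate configs.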

def logFailingConfigs(configs: List[AttentionConfiguration], filename: str):
    """Writes the driver command line for each failing configuration to a CSV file."""
    with open(filename, mode='w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(['CommandLine'])
        for config in configs:
            writer.writerow([' '.join(config.generateMlirDriverCommandLine(''))])

def main():
    parser = argparse.ArgumentParser(
        description='Sweep parameter values for attention to detect bugs')
    parser.add_argument('--debug', action='store_true')
    parser.add_argument('--quiet', action='store_true')
    parser.add_argument('--jobs', type=int, default=os.cpu_count())
    parser.add_argument('--mlir-build-dir', type=str, default=findMlirBuildDir())
    parser.add_argument('--samples', type=int, default=1000)
    parser.add_argument('--log-failures', action='store_true')

    args = parser.parse_args()
    arch = getArch()
    chip_match = GFX_CHIP_RE.search(arch)
    if chip_match is None:
        raise RuntimeError(f"Could not find GFX chip in arch string: {arch}")
    chip = chip_match.group(0)
    paths = createPaths(None, args.mlir_build_dir)
    options = Options(
        debug=args.debug,
        quiet=args.quiet,
        arch=arch,
        flags=[],
        concurrent_tests=args.jobs,
        numCu=getNumCU(chip)
    )

    if not args.quiet:
        print(f"Sampling {args.samples} configurations from attention space...")

    # TODO: use AmdArchDb python version when available

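    # Assumption: gfx9* (CDNA-class) chips use the MFMA space; other chips fall back to WMMA.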
    if chip.startswith('gfx9'):
        perfConfigSpace = perfConfigSpaceMFMA
    else:
        perfConfigSpace = perfConfigSpaceWMMA

    samples = [
        (sampleAttentionShape(), random.choice(perfConfigSpace))
        for _ in range(args.samples)
    ]

    passed, invalid, failing = asyncio.run(sweepParameters(samples, toAttentionConfig, options, paths))
    if failing:
        print("\n" + "-" * 80)
        print(f"{'Failing Configurations':^80}\n")
        for fail in failing:
            print(multilineRepr(fail))
        if args.log_failures:
            logFailingConfigs(failing, LOGFILE)

    print(f"\nPassed: {passed}, Invalid: {invalid}, Failed: {len(failing)}")

    return 0

if __name__ == '__main__':
    ret = main()
    sys.exit(ret)