47 commits
- 826bda0  [FA] 4-stage FA pipeliner (AlexAUT, Apr 15, 2025)
- c35e297  [FA] Add FA scripts (AlexAUT, Apr 15, 2025)
- 06cf75a  [FA] Place cvt layout in the same stage and cluster as LocalLoad so c… (AlexAUT, Apr 16, 2025)
- 203fe11  [ASYNC_COPY] Add env var to bypass permute, only works if the load di… (AlexAUT, Apr 23, 2025)
- b664353  [FA] Do not combine AsyncWaits to have a barrier in front of each mem… (AlexAUT, Apr 23, 2025)
- 012793a  [ASYNC_COPY] Remove MemoryEffect of BufferLoadToLocal to avoid implic… (AlexAUT, Apr 23, 2025)
- 3b74f4a  [FA] Compute max before mul QK_SCALE to fold sub into fma (AlexAUT, Apr 23, 2025)
- b059372  [FA] Added 2 extra clusters to have async_waits in front of memory cl… (AlexAUT, Apr 24, 2025)
- f3bb293  [FA] Place LocalLoads before AsyncCopies (AlexAUT, Apr 24, 2025)
- 77884fa  [FA][ASYNC_COPY] Force vec=8 for shared encodings to avoid 32bit dire… (AlexAUT, Apr 24, 2025)
- b2e2ad0  [FA] Place dots at the top of clusters (AlexAUT, Apr 24, 2025)
- fab1281  [FA] Split 4-stage clusters into 8 clusters to better controll the or… (AlexAUT, Apr 24, 2025)
- e0ea5e7  [FA] Revert order change in SM clusters (AlexAUT, Apr 24, 2025)
- 1198462  [FA] Set vecSize=nonKDim for V shared layout to avoid bank conflicts (zhanglx13, Apr 27, 2025)
- fb186d4  [FA] Removed old vectorSize workaround (AlexAUT, Apr 28, 2025)
- 3212481  [FA] Revert "Place AsyncWait at the top of schedule" (AlexAUT, Apr 29, 2025)
- 34beed7  [FA][PINGPONG] Add support for FAv3 pingpong. (jungpark-mlir, Apr 29, 2025)
- 3861063  [FA][PINGPONG] Allow block pingpong with num_stages==4 (AlexAUT, Apr 30, 2025)
- fc6d1d9  [FA][PINGPONG] Bail out if async wait count != 2 (AlexAUT, May 2, 2025)
- d6a0419  [FA] Do not pipeline second loop (causal) (AlexAUT, May 3, 2025)
- 1d1e8cc  [FA] Split FourStagePipeliner to separate file and do very basic sele… (AlexAUT, May 7, 2025)
- d46f750  [GEMM] Add combine dot_scaled and addF (jungpark-mlir, May 1, 2025)
- 4a5ece6  [GEMM] Do not swizzle the scale (AlexAUT, May 6, 2025)
- 8285bfc  Add layout conversion pass optim at the end (vgokhale, May 12, 2025)
- 916cb06  Initial commit to enable pingpong for dot_scaled with mxfp4 (jungpark-mlir, May 12, 2025)
- b3c2f94  Fix to the gemm pingpong. (jungpark-mlir, May 13, 2025)
- a19dd6d  Add restriction to dot_scaled pingpong. (jungpark-mlir, May 13, 2025)
- f6065b9  Revert "[AMD] Use v_permlane to optimize MFAM to linear layout on GFX… (AlexAUT, May 13, 2025)
- d7e2e2c  Revert "[BACKEND] bump to llvm/llvm-project@3c709802d31b (#6754)" (AlexAUT, May 13, 2025)
- 247f4f4  Revert because no longer needed: "[ASYNC_COPY] Remove MemoryEffect of… (AlexAUT, May 14, 2025)
- 1028c8f  [AMD] Enable async pingpong for F16 GEMMs (#796) (raikonenfnu, May 15, 2025)
- c5c0e67  Add initial support for skinny mxfp gemm (jungpark-mlir, May 17, 2025)
- bcc871d  add AB load separated pingpong for skinny gemm. (jungpark-mlir, May 18, 2025)
- 1b2a86b  [AMD] add slicing `async-copy-local-to-global` (ravil-mobile, May 15, 2025)
- 33f6ce9  Revert "Revert "[AMD] Use v_permlane to optimize MFAM to linear layou… (antiagainst, May 19, 2025)
- 0f7bbc2  [AMD] Use composition to swap columns for mfma like store layout (#6844) (antiagainst, May 16, 2025)
- aebdfd7  [ASYNCCOPY] Simplify swizzling calculations to get better codegen from… (AlexAUT, May 16, 2025)
- 6527f10  Code cleanup (jungpark-mlir, May 19, 2025)
- 1082cd2  Add skinny pingpong transform (jungpark-mlir, May 20, 2025)
- 5c4b1fb  [FA] Disable pipelining for causal loop (AlexAUT, May 20, 2025)
- 18ae32b  [AMD] Add an option to force async copy overlapping (joviliast, May 16, 2025)
- 77c00fa  [AMD] Add an option to force async copy overlapping (joviliast, May 21, 2025)
- c5ceb64  [AMD] Improved CanonicalizePointers for ExtractSlice (ravil-mobile, May 19, 2025)
- a89b3b4  Merge branch 'shared/triton-gfx950-launch' into shared/triton-gfx950-… (ravil-mobile, May 21, 2025)
- a981b01  [AMD] Add a Concat op to AMDGPU dialect (#6590) (plognjen, May 20, 2025)
- 6a6fb70  WA for incorrect strides in subview (AlexAUT, May 21, 2025)
- 34538bc  [AMD] improved subviewing for async-copy-local-to-global (ravil-mobile, May 26, 2025)
2 changes: 1 addition & 1 deletion .github/workflows/integration-tests-amd.yml
@@ -109,7 +109,7 @@ jobs:
           echo "Could not find '${INSTRUMENTATION_LIB_DIR}'" ; exit -1
         fi
         pytest --capture=tee-sys -rfs python/tutorials/06-fused-attention.py
-        pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice.py
+        pytest --capture=tee-sys -rfs third_party/amd/python/test/test_extract_slice_concat_op.py
         TRITON_ALWAYS_COMPILE=1 pytest --capture=tee-sys -rfs third_party/amd/python/test/test_scalarize_packed_fops.py
         cd python/test/unit
         pytest --capture=tee-sys -rfs -n 12 language runtime \
2 changes: 1 addition & 1 deletion cmake/llvm-hash.txt
@@ -1 +1 @@
-3c709802d31b5bc5ed3af8284b40593ff39b9eec
+092b6e73e651469527662443b592f98f442ece72
2,139 changes: 2,139 additions & 0 deletions fa/flash-attention.py

Large diffs are not rendered by default.

42 changes: 42 additions & 0 deletions fa/model_configs.json
@@ -0,0 +1,42 @@
{
    "llama3": {
        "8B": {
            "num_attention_heads": 32,
            "num_key_value_heads": 8,
            "hidden_size": 4096,
            "intermediate_size": 14336,
            "vocab_size": 128256
        },
        "70B": {
            "num_attention_heads": 64,
            "num_key_value_heads": 8,
            "hidden_size": 8192,
            "intermediate_size": 28672,
            "vocab_size": 128256
        },
        "405B": {
            "num_attention_heads": 128,
            "num_key_value_heads": 8,
            "hidden_size": 16384,
            "intermediate_size": 53248,
            "vocab_size": 128256
        }
    },
    "mistral": {
        "7B": {
            "hidden_size": 4096,
            "intermediate_size": 14336,
            "num_attention_heads": 32,
            "num_key_value_heads": 8,
            "vocab_size": 32000
        },
        "22B": {
            "hidden_size": 6144,
            "intermediate_size": 16384,
            "num_attention_heads": 48,
            "num_key_value_heads": 8,
            "vocab_size": 32000
        }
    }
}
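
For reference, a minimal sketch (not part of this PR) of how these entries can be interpreted: each model records its attention geometry, from which the per-head dimension and the grouped-query-attention (GQA) group size follow. The numbers in the comments come from the llama3-8B entry above; the snippet itself is illustrative only.

```python
import json

# Minimal sketch: derive per-head dimension and GQA group size
# from one entry in fa/model_configs.json.
with open("fa/model_configs.json") as f:
    cfg = json.load(f)["llama3"]["8B"]

head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]            # 4096 // 32 = 128
gqa_groups = cfg["num_attention_heads"] // cfg["num_key_value_heads"]  # 32 // 8 = 4

print(f"head_dim={head_dim}, gqa_groups={gqa_groups}")
```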
Empty file added fa/utils/__init__.py
71 changes: 71 additions & 0 deletions fa/utils/benchmark_utils.py
@@ -0,0 +1,71 @@
import os
import json

# Base directory where configs are located
BASE_DIR = os.path.abspath(os.path.join(os.path.dirname(__file__), "../"))


def get_model_configs(config_path='model_configs.json', model_families=["llama3"], model="all"):
    """
    Load model configurations from the configuration file.

    Args:
        config_path (str): User-provided path to the configuration JSON file.
        model_families (list): List of model family names to retrieve.
        model (str): "all" to keep every model in the selected families, or a
            specific name such as "llama3_8B" / "llama3-8B" to filter by.

    Returns:
        dict: A dictionary of available models and their configurations for the specified families.
    """
    # Resolve config path relative to the fa/ directory
    config_path = os.path.join(BASE_DIR, config_path)

    with open(config_path, 'r') as f:
        configs = json.load(f)

    # Extract models and their configurations for the specified families
    filtered_configs = {}

    for family in model_families:
        if family in configs:
            # Check if model filtering is required
            if model == "all":
                # Include all models in the family
                for model_size, model_configs in configs[family].items():
                    filtered_configs[f"{family}-{model_size}"] = model_configs
            else:
                # Parse the model string (e.g., llama3_8B or llama3-8B)
                delimiter = "_" if "_" in model else "-"
                model_parts = model.split(delimiter)

                # Check if the family and size match
                if len(model_parts) == 2 and model_parts[0] == family:
                    model_size = model_parts[1]
                    if model_size in configs[family]:
                        filtered_configs[f"{family}-{model_size}"] = configs[family][model_size]

    if not filtered_configs:
        print(f"Warning: No models selected for families: {model_families} with filter: '{model}'")

    return filtered_configs


def get_available_models(config_file='model_configs.json', model_families=["llama3"]):
    """
    Load model names from the configuration file.

    Args:
        config_file (str): Path to the configuration JSON file.
        model_families (list): List of model family names to retrieve.

    Returns:
        list: A list of available models for the specified families.
    """
    # Resolve config path relative to the fa/ directory
    config_path = os.path.join(BASE_DIR, config_file)

    with open(config_path, 'r') as f:
        configs = json.load(f)

    models = [f"{family}-{model}" for family in model_families if family in configs for model in configs[family]]

    return models
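
A minimal usage sketch for the two helpers above (not part of this PR), assuming it is run from the repository root so that fa/ resolves as a package; the printed values follow from fa/model_configs.json.

```python
# Illustrative usage of the helpers added in fa/utils/benchmark_utils.py.
from fa.utils.benchmark_utils import get_available_models, get_model_configs

# List every llama3 and mistral model described in fa/model_configs.json.
print(get_available_models(model_families=["llama3", "mistral"]))
# e.g. ['llama3-8B', 'llama3-70B', 'llama3-405B', 'mistral-7B', 'mistral-22B']

# Pull the configuration for a single model; "llama3_8B" and "llama3-8B"
# are both accepted by the filter.
cfg = get_model_configs(model_families=["llama3"], model="llama3-8B")
print(cfg["llama3-8B"]["hidden_size"])  # 4096
```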
59 changes: 59 additions & 0 deletions fa/utils/rocprof_benchmark.py
@@ -0,0 +1,59 @@
import subprocess
import os
import pandas as pd
from prettytable import PrettyTable


def run_profiling(triton_dir, batch_size, output_file):
    command = [
        "rocprof", "--stats", "-o", output_file, "python", f"{triton_dir}/python/perf-kernels/MLA_decode_rope.py", "-B",
        str(batch_size), "-dtype", "bf16", "-use_rope"
    ]
    subprocess.run(command, check=True)


def parse_profiling_output(output_file, kernel_names):
    df = pd.read_csv(output_file)
    results = {}
    for kernel in kernel_names:
        kernel_data = df[df['Name'].str.strip('"') == kernel]
        if not kernel_data.empty:
            results[kernel] = kernel_data['AverageNs'].iloc[0] / 1000.0
        else:
            results[kernel] = None

    # Calculate sum of other kernels
    other_kernels = df[~df['Name'].str.strip('"').isin(kernel_names)]
    other_kernels_sum = other_kernels['AverageNs'].sum() / 1000.0
    results['other_kernels_sum'] = other_kernels_sum

    return results


def main():
    # Default to ~/triton if TRITONDIR is not set; expand '~' since subprocess does not use a shell
    triton_dir = os.path.expanduser(os.environ.get("TRITONDIR", "~/triton"))
    output_file = os.path.expanduser("~/profiling.csv")
    kernel_names = ["_fwd_grouped_kernel_stage1_rope.kd", "_fwd_grouped_kernel_stage1.kd"]
    batch_sizes = [1, 4, 32, 64, 128]

    results = {B: {} for B in batch_sizes}
    for B in batch_sizes:
        print(f"Running profiling for B={B}...")
        run_profiling(triton_dir, B, output_file)
        output_stats_file = os.path.expanduser("~/profiling.stats.csv")
        kernel_results = parse_profiling_output(output_stats_file, kernel_names)
        results[B] = kernel_results

    table = PrettyTable()
    table.field_names = ["B"] + kernel_names + ["Other Kernels Sum (µs)"]
    for B in batch_sizes:
        row = [B] + [results[B].get(kernel, "N/A")
                     for kernel in kernel_names] + [results[B].get('other_kernels_sum', "N/A")]
        table.add_row(row)

    print("\nProfiling Summary (in microseconds):")
    print(table)


if __name__ == "__main__":
    main()
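
For context, a small self-contained sketch (not part of this PR) of the stats table shape that parse_profiling_output consumes: rocprof's --stats CSV carries at least the Name and AverageNs columns, which is all the parser relies on. The timings and the third kernel name below are made up purely for illustration.

```python
# Toy illustration: exercise parse_profiling_output on a fabricated stats table
# with the two columns it relies on ("Name", "AverageNs").
import pandas as pd
from fa.utils.rocprof_benchmark import parse_profiling_output

toy = pd.DataFrame({
    "Name": ['"_fwd_grouped_kernel_stage1_rope.kd"', '"_fwd_grouped_kernel_stage1.kd"', '"some_other_kernel.kd"'],
    "AverageNs": [125000, 98000, 42000],
})
toy.to_csv("/tmp/toy_stats.csv", index=False)

print(parse_profiling_output("/tmp/toy_stats.csv",
                             ["_fwd_grouped_kernel_stage1_rope.kd", "_fwd_grouped_kernel_stage1.kd"]))
# {'_fwd_grouped_kernel_stage1_rope.kd': 125.0, '_fwd_grouped_kernel_stage1.kd': 98.0, 'other_kernels_sum': 42.0}
```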