[BENCHMARK] Add llama3.1 third party benchmark (#4798)

jakub-sochacki · web-flow · commit c01b6ec34862 · 2025-07-31T14:39:47.000+02:00
- Added a step "Run Llama 3.1 performance benchmark" in "third-party-benchmarks.yml" - Benchmark "flex_attention-pr-check" is cloned from [external repository](https://github.com/LiyangLingIntel/flex_attention-pr-check) - Enabled option to run only selected benchmark through a GitHub workflow - The test uploads CSV artifact
diff --git a/.github/workflows/third-party-benchmarks.yml b/.github/workflows/third-party-benchmarks.yml
@@ -11,6 +11,10 @@ on:
         description: Tag for benchmark results
         type: string
         default: "test"
+      benchmarks:
+        description: JSON list of benchmarks to run. Leave empty to run all benchmarks.
+        type: string
+        default: ""
       use_pyenv_python:
         description: Use Python built with pyenv
         type: boolean
@@ -24,6 +28,7 @@ permissions: read-all
 env:
   PYTHON_VERSION: "3.10"
   TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
+  HF_TOKEN: ${{ secrets.HF_TOKEN || '' }}
 
 jobs:
   build:
@@ -83,7 +88,7 @@ jobs:
           echo "REPORTS=$PWD/reports" >> $GITHUB_ENV
 
       - name: Run Liger-Kernel benchmarks
-        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'liger-kernel')) }}
         run: |
           source ./scripts/capture-hw-details.sh
 
@@ -102,6 +107,40 @@ jobs:
           # Return the captured return code at the end
           exit "$RET_CODE"
 
+      - name: Run e2e Llama 3.1 flex attention performance benchmark
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'llama3-1')) }}
+        run: |
+          source ./scripts/capture-hw-details.sh
+
+          git clone https://github.com/huggingface/transformers.git
+
+          cd transformers
+
+          git checkout $(<../benchmarks/third_party/e2e-flex_attention/transformers-commit.txt)
+          git apply ../benchmarks/third_party/e2e-flex_attention/transformers-patch-for-timing.diff
+
+          git submodule sync
+          git submodule update --init --recursive
+          python setup.py develop
+
+          cd ../benchmarks/third_party/e2e-flex_attention
+
+          MODEL_NAME="meta-llama/Llama-3.1-8B"
+          MAX_NEW_TOKENS=128
+          INPUT_TOKENS=1024
+          BATCH_SIZE=1
+
+          python run_llm_inductor_greedy.py -m $MODEL_NAME --max-new-tokens $MAX_NEW_TOKENS --input-tokens $INPUT_TOKENS --num-warmup 2 --num-iter 7 --compile --profile | tee llm.compile.xpu.profile.log
+
+          echo "LLM profiling log is stored into $PWD/llm.compile.xpu.profile.log"
+
+          cp llm.compile.xpu.profile.log $REPORTS/llm.compile.xpu.profile.log
+          python transform_results.py $REPORTS/llm.compile.xpu.profile.log $REPORTS/llm-triton-report.csv \
+            --tag $TAG \
+            --model "$MODEL_NAME" \
+            --max-new-tokens $MAX_NEW_TOKENS \
+            --batch-size $BATCH_SIZE
+
       - name: Upload benchmark reports
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         uses: actions/upload-artifact@v4
diff --git a/benchmarks/third_party/e2e-flex_attention/prompt.json b/benchmarks/third_party/e2e-flex_attention/prompt.json
diff --git a/benchmarks/third_party/e2e-flex_attention/run_llm_inductor_greedy.py b/benchmarks/third_party/e2e-flex_attention/run_llm_inductor_greedy.py
@@ -0,0 +1,161 @@
+# coding=utf-8
+# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import argparse
+import json
+import pathlib
+import time
+from itertools import chain
+
+import numpy as np
+import torch
+from transformers import (
+    AutoModelForCausalLM,
+    AutoTokenizer,
+)
+
+
+def trace_handler(profile_obj):
+    print(profile_obj.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1))
+
+
+parser = argparse.ArgumentParser("LLM generation (greedy search) script for inductor torch.compile path",
+                                 add_help=False)
+parser.add_argument(
+    "-m",
+    "--model-name-or-path",
+    default="meta-llama/Llama-2-7b-hf",
+    type=str,
+    help="path to model or model name in HF hub",
+)
+parser.add_argument(
+    "--dtype",
+    type=str,
+    choices=["fp32", "bf16", "fp16"],
+    help="bf16 or fp32",
+    default="bf16",
+)
+parser.add_argument("--max-new-tokens", default=32, type=int, help="output max new tokens")
+parser.add_argument("--input-tokens", default="32", type=str)
+parser.add_argument("--page-size", default=32, type=int)
+parser.add_argument("--prompt", default=None, type=str)
+parser.add_argument("--num-iter", default=1, type=int, help="num iter")
+parser.add_argument("--num-warmup", default=0, type=int, help="num warmup")
+parser.add_argument("--batch-size", default=1, type=int, help="batch size")
+parser.add_argument("--device", default="xpu", type=str)
+parser.add_argument("--profile", action="store_true")
+parser.add_argument("--compile", action="store_true")
+args = parser.parse_args()
+
+if args.dtype == "bf16":
+    amp_enabled = True
+    load_dtype = torch.bfloat16
+elif args.dtype == "fp32":
+    amp_enabled = False
+    load_dtype = torch.float
+elif args.dtype == "fp16":
+    amp_enabled = True
+    load_dtype = torch.float16
+else:
+    assert False, "This script only support bf16 and fp32 as dtype"
+
+attn_type = "flex_attention"
+tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
+model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=load_dtype,
+                                             attn_implementation=attn_type).to(args.device)
+if attn_type == "paged_attention":
+    model.generation_config.cache_implementation = "paged"
+    model.config.page_size = args.page_size
+
+if args.compile:
+    with torch.no_grad(), torch.autocast(enabled=amp_enabled, device_type=args.device, dtype=load_dtype):
+        print("compile Enabled")
+        model.forward = torch.compile(model.forward, dynamic=True)
+
+# greedy search
+generate_kwargs = {
+    "do_sample": False,
+    "temperature": 0.9,
+    "num_beams": 1,
+    "token_latency": True,
+}
+current_path = pathlib.Path(__file__).parent.resolve()
+if args.prompt is not None:
+    prompt = args.prompt
+else:
+    with open(str(current_path) + "/prompt.json", encoding="utf-8") as f:
+        prompt_pool = json.load(f)
+    if "llama" in prompt_pool and args.input_tokens in prompt_pool["llama"]:
+        prompt = prompt_pool["llama"]["2048"]
+    else:
+        raise SystemExit(
+            "[ERROR] No such input_tokens prompt in prompt.json, Please use --prompt if want to use custom input.")
+
+prompt = [prompt] * args.batch_size
+inputs = tokenizer(prompt, return_tensors="pt", max_length=int(args.input_tokens))
+input_ids = inputs.input_ids.to(args.device)
+attention_mask = inputs.attention_mask.to(args.device)
+
+input_size = input_ids.size(dim=1)
+print(f"---- Prompt size: {input_size}")
+
+# warmup
+with torch.no_grad(), torch.autocast(enabled=amp_enabled, device_type=args.device, dtype=load_dtype):
+    for _ in range(args.num_warmup):
+        model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=args.max_new_tokens, **generate_kwargs)
+
+if args.profile:
+    with torch.profiler.profile(activities=[
+            torch.profiler.ProfilerActivity.CPU,
+            torch.profiler.ProfilerActivity.XPU,
+    ], schedule=torch.profiler.schedule(wait=0, warmup=2, active=5), on_trace_ready=trace_handler,
+                                record_shapes=True) as prof:
+        with torch.no_grad(), torch.autocast(enabled=amp_enabled, device_type=args.device, dtype=load_dtype):
+            for i in range(7):
+
+                model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=args.max_new_tokens,
+                               **generate_kwargs)
+                prof.step()
+# benchmark
+num_iter = args.num_iter - args.num_warmup
+total_time = 0.0
+total_list = []
+gen_text = None
+with torch.no_grad(), torch.autocast(enabled=amp_enabled, device_type=args.device, dtype=load_dtype):
+    for _ in range(num_iter):
+        torch.xpu.synchronize()
+        tic = time.time()
+        output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=args.max_new_tokens,
+                                **generate_kwargs)
+        gen_ids = output[0]
+        gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
+        torch.xpu.synchronize()
+        toc = time.time()
+        total_time += toc - tic
+        total_list.append(output[1])
+
+print(gen_text, flush=True)
+print("\n", "-" * 10, "Summary:", "-" * 10)
+latency = total_time / num_iter
+print(f"inference-latency: {latency:.3f} sec.")
+first_latency = np.mean([x[0] for x in total_list])
+if args.max_new_tokens > 1:
+    next_latency_list = list(chain(*[x[1:] for x in total_list]))
+    next_latency_list.sort()
+    average_next_latency = np.mean(next_latency_list)
+    p90_latency = np.percentile(next_latency_list, 90)
+print(f"first-token-latency: {first_latency:.3f} sec.")
+if args.max_new_tokens > 1:
+    print(f"rest-token-latency: {average_next_latency:.3f} sec.")
+    print(f"P90-rest-token-latency: {p90_latency:.3f} sec.")
diff --git a/benchmarks/third_party/e2e-flex_attention/transform_results.py b/benchmarks/third_party/e2e-flex_attention/transform_results.py
@@ -0,0 +1,115 @@
+import argparse
+import re
+import os
+import uuid
+import json
+from datetime import datetime
+
+import pandas as pd
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Parse LLM profiling log')
+    parser.add_argument('log_file', help='Path to the LLM profiling log file')
+    parser.add_argument('output_csv', help='Path to output CSV file')
+    parser.add_argument('--tag', help='Tag for the benchmark run', default='')
+    parser.add_argument('--model', help='Model name', default='unknown-model')
+    parser.add_argument('--max-new-tokens', type=int, help='Maximum new tokens', default=128)
+    parser.add_argument('--batch-size', type=int, help='Batch size', default=1)
+
+    return parser.parse_args()
+
+
+def parse_llm_log(log_file_path, tag, model, max_new_tokens, batch_size):
+    """Parse the LLM profiling log and extract performance metrics."""
+
+    with open(log_file_path, 'r', encoding='utf-8') as f:
+        content = f.read()
+
+    metrics = {}
+
+    inference_match = re.search(r'inference-latency:\s+([\d.]+)\s+sec\.', content)
+    if inference_match:
+        metrics['inference_latency'] = float(inference_match.group(1))
+
+    first_token_match = re.search(r'first-token-latency:\s+([\d.]+)\s+sec\.', content)
+    if first_token_match:
+        metrics['first_token_latency'] = float(first_token_match.group(1))
+
+    rest_token_match = re.search(r'rest-token-latency:\s+([\d.]+)\s+sec\.', content)
+    if rest_token_match:
+        metrics['rest_token_latency'] = float(rest_token_match.group(1))
+
+    p90_match = re.search(r'P90-rest-token-latency:\s+([\d.]+)\s+sec\.', content)
+    if p90_match:
+        metrics['p90_rest_token_latency'] = float(p90_match.group(1))
+
+    prompt_match = re.search(r'Prompt size:\s+(\d+)', content)
+    prompt_size = int(prompt_match.group(1)) if prompt_match else 1024
+
+    params = {
+        'model': model,
+        'input_tokens': prompt_size,
+        'max_new_tokens': max_new_tokens,
+        'batch_size': batch_size,
+    }
+    params_json = json.dumps(params)
+
+    rows = []
+    run_uuid = uuid.uuid4().hex
+    current_datetime = datetime.now().isoformat()
+
+    # Create one row for each metric
+    for metric_name, metric_value in metrics.items():
+        row = {
+            'benchmark': 'e2e-flex-attention',
+            'run_uuid': run_uuid,
+            'datetime': current_datetime,
+            'compiler': 'triton',
+            'metric_name': metric_name,
+            'metric_value': metric_value,
+            'params': params_json,
+            'tag': tag,
+        }
+        rows.append(row)
+
+    df_results = pd.DataFrame(rows)
+
+    host_info = {
+        n: os.getenv(n.upper(), default='')
+        for n in [
+            'libigc1_version',
+            'level_zero_version',
+            'gpu_device',
+            'agama_version',
+            'torch_version',
+            'compiler_version',
+            'benchmarking_method',
+        ]
+    }
+    if not host_info['gpu_device']:
+        raise RuntimeError('Could not find GPU device description, was `capture-hw-details.sh` called?')
+
+    for name, val in host_info.items():
+        df_results[name] = val
+
+    print(f'Extracted metrics: {json.dumps(metrics, indent=2)}')
+    print(f'DataFrame shape: {df_results.shape}')
+
+    return df_results
+
+
+def main():
+    args = parse_args()
+    if not os.path.exists(args.log_file):
+        print(f'Error: Log file {args.log_file} not found')
+        return 1
+
+    df_results = parse_llm_log(args.log_file, args.tag, args.model, args.max_new_tokens, args.batch_size)
+    df_results.to_csv(args.output_csv, index=False)
+    print(f'Transformed CSV saved to {args.output_csv}')
+    return 0
+
+
+if __name__ == '__main__':
+    main()
diff --git a/benchmarks/third_party/e2e-flex_attention/transformers-commit.txt b/benchmarks/third_party/e2e-flex_attention/transformers-commit.txt
@@ -0,0 +1 @@
+6e9972962fbc80d218234bfbd8c9b2843ef02b2b
diff --git a/benchmarks/third_party/e2e-flex_attention/transformers-patch-for-timing.diff b/benchmarks/third_party/e2e-flex_attention/transformers-patch-for-timing.diff

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+6e9972962fbc80d218234bfbd8c9b2843ef02b2b`