
Commit c01b6ec

[BENCHMARK] Add llama3.1 third party benchmark (#4798)
- Added a step "Run Llama 3.1 performance benchmark" in "third-party-benchmarks.yml"
- Benchmark "flex_attention-pr-check" is cloned from an [external repository](https://github.com/LiyangLingIntel/flex_attention-pr-check)
- Enabled an option to run only selected benchmarks through a GitHub workflow dispatch input
- The test uploads a CSV artifact
1 parent 7a02467 commit c01b6ec
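
With the new "benchmarks" input, a manual run can be limited to a subset of the third-party benchmarks. A minimal dispatch sketch, assuming the GitHub CLI (gh) is authenticated against the repository and the workflow file exists on the targeted branch; the tag value is only an example:

gh workflow run third-party-benchmarks.yml \
  -f benchmarks='["llama3-1"]' \
  -f tag=llama31-test

Leaving "benchmarks" empty (the default) keeps the previous behaviour and runs every benchmark, since each step's "if:" condition also accepts an empty input.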

6 files changed: 467 additions & 1 deletion


.github/workflows/third-party-benchmarks.yml

Lines changed: 40 additions & 1 deletion
@@ -11,6 +11,10 @@ on:
         description: Tag for benchmark results
         type: string
         default: "test"
+      benchmarks:
+        description: JSON list of benchmarks to run. Leave empty to run all benchmarks.
+        type: string
+        default: ""
       use_pyenv_python:
         description: Use Python built with pyenv
         type: boolean
@@ -24,6 +28,7 @@ permissions: read-all
 env:
   PYTHON_VERSION: "3.10"
   TAG: ${{ inputs.tag || (github.event_name == 'pull_request' && format('pr-{0}', github.event.number)) || (github.event_name == 'schedule' && 'ci') || 'test' }}
+  HF_TOKEN: ${{ secrets.HF_TOKEN || '' }}

 jobs:
   build:
@@ -83,7 +88,7 @@ jobs:
           echo "REPORTS=$PWD/reports" >> $GITHUB_ENV

       - name: Run Liger-Kernel benchmarks
-        if: ${{ steps.install.outcome == 'success' && !cancelled() }}
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'liger-kernel')) }}
         run: |
           source ./scripts/capture-hw-details.sh

@@ -102,6 +107,40 @@
           # Return the captured return code at the end
           exit "$RET_CODE"

+      - name: Run e2e Llama 3.1 flex attention performance benchmark
+        if: ${{ steps.install.outcome == 'success' && !cancelled() && (inputs.benchmarks == '' || contains(fromJson(inputs.benchmarks || '[]'), 'llama3-1')) }}
+        run: |
+          source ./scripts/capture-hw-details.sh
+
+          git clone https://github.com/huggingface/transformers.git
+
+          cd transformers
+
+          git checkout $(<../benchmarks/third_party/e2e-flex_attention/transformers-commit.txt)
+          git apply ../benchmarks/third_party/e2e-flex_attention/transformers-patch-for-timing.diff
+
+          git submodule sync
+          git submodule update --init --recursive
+          python setup.py develop
+
+          cd ../benchmarks/third_party/e2e-flex_attention
+
+          MODEL_NAME="meta-llama/Llama-3.1-8B"
+          MAX_NEW_TOKENS=128
+          INPUT_TOKENS=1024
+          BATCH_SIZE=1
+
+          python run_llm_inductor_greedy.py -m $MODEL_NAME --max-new-tokens $MAX_NEW_TOKENS --input-tokens $INPUT_TOKENS --num-warmup 2 --num-iter 7 --compile --profile | tee llm.compile.xpu.profile.log
+
+          echo "LLM profiling log is stored into $PWD/llm.compile.xpu.profile.log"
+
+          cp llm.compile.xpu.profile.log $REPORTS/llm.compile.xpu.profile.log
+          python transform_results.py $REPORTS/llm.compile.xpu.profile.log $REPORTS/llm-triton-report.csv \
+            --tag $TAG \
+            --model "$MODEL_NAME" \
+            --max-new-tokens $MAX_NEW_TOKENS \
+            --batch-size $BATCH_SIZE
+
       - name: Upload benchmark reports
         if: ${{ steps.install.outcome == 'success' && !cancelled() }}
         uses: actions/upload-artifact@v4

benchmarks/third_party/e2e-flex_attention/prompt.json

Lines changed: 15 additions & 0 deletions
Large diffs are not rendered by default.
benchmarks/third_party/e2e-flex_attention/run_llm_inductor_greedy.py

Lines changed: 161 additions & 0 deletions
@@ -0,0 +1,161 @@
# coding=utf-8
# Copyright 2020 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import pathlib
import time
from itertools import chain

import numpy as np
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
)


def trace_handler(profile_obj):
    print(profile_obj.key_averages().table(sort_by="self_xpu_time_total", row_limit=-1))


parser = argparse.ArgumentParser("LLM generation (greedy search) script for inductor torch.compile path",
                                 add_help=False)
parser.add_argument(
    "-m",
    "--model-name-or-path",
    default="meta-llama/Llama-2-7b-hf",
    type=str,
    help="path to model or model name in HF hub",
)
parser.add_argument(
    "--dtype",
    type=str,
    choices=["fp32", "bf16", "fp16"],
    help="fp32, bf16 or fp16",
    default="bf16",
)
parser.add_argument("--max-new-tokens", default=32, type=int, help="output max new tokens")
parser.add_argument("--input-tokens", default="32", type=str)
parser.add_argument("--page-size", default=32, type=int)
parser.add_argument("--prompt", default=None, type=str)
parser.add_argument("--num-iter", default=1, type=int, help="num iter")
parser.add_argument("--num-warmup", default=0, type=int, help="num warmup")
parser.add_argument("--batch-size", default=1, type=int, help="batch size")
parser.add_argument("--device", default="xpu", type=str)
parser.add_argument("--profile", action="store_true")
parser.add_argument("--compile", action="store_true")
args = parser.parse_args()

if args.dtype == "bf16":
    amp_enabled = True
    load_dtype = torch.bfloat16
elif args.dtype == "fp32":
    amp_enabled = False
    load_dtype = torch.float
elif args.dtype == "fp16":
    amp_enabled = True
    load_dtype = torch.float16
else:
    assert False, "This script only supports fp32, bf16 and fp16 as dtype"

attn_type = "flex_attention"
tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
model = AutoModelForCausalLM.from_pretrained(args.model_name_or_path, torch_dtype=load_dtype,
                                             attn_implementation=attn_type).to(args.device)
if attn_type == "paged_attention":
    model.generation_config.cache_implementation = "paged"
    model.config.page_size = args.page_size

if args.compile:
    with torch.no_grad(), torch.autocast(enabled=amp_enabled, device_type=args.device, dtype=load_dtype):
        print("compile Enabled")
        model.forward = torch.compile(model.forward, dynamic=True)

# greedy search
generate_kwargs = {
    "do_sample": False,
    "temperature": 0.9,
    "num_beams": 1,
    "token_latency": True,
}
current_path = pathlib.Path(__file__).parent.resolve()
if args.prompt is not None:
    prompt = args.prompt
else:
    with open(str(current_path) + "/prompt.json", encoding="utf-8") as f:
        prompt_pool = json.load(f)
    if "llama" in prompt_pool and args.input_tokens in prompt_pool["llama"]:
        prompt = prompt_pool["llama"]["2048"]
    else:
        raise SystemExit(
            "[ERROR] No such input_tokens prompt in prompt.json. Please use --prompt if you want to use a custom input.")

prompt = [prompt] * args.batch_size
inputs = tokenizer(prompt, return_tensors="pt", max_length=int(args.input_tokens))
input_ids = inputs.input_ids.to(args.device)
attention_mask = inputs.attention_mask.to(args.device)

input_size = input_ids.size(dim=1)
print(f"---- Prompt size: {input_size}")

# warmup
with torch.no_grad(), torch.autocast(enabled=amp_enabled, device_type=args.device, dtype=load_dtype):
    for _ in range(args.num_warmup):
        model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=args.max_new_tokens, **generate_kwargs)

if args.profile:
    with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.XPU,
            ],
            schedule=torch.profiler.schedule(wait=0, warmup=2, active=5),
            on_trace_ready=trace_handler,
            record_shapes=True,
    ) as prof:
        with torch.no_grad(), torch.autocast(enabled=amp_enabled, device_type=args.device, dtype=load_dtype):
            for _ in range(7):
                model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=args.max_new_tokens,
                               **generate_kwargs)
                prof.step()

# benchmark
num_iter = args.num_iter - args.num_warmup
total_time = 0.0
total_list = []
gen_text = None
with torch.no_grad(), torch.autocast(enabled=amp_enabled, device_type=args.device, dtype=load_dtype):
    for _ in range(num_iter):
        torch.xpu.synchronize()
        tic = time.time()
        output = model.generate(input_ids, attention_mask=attention_mask, max_new_tokens=args.max_new_tokens,
                                **generate_kwargs)
        gen_ids = output[0]
        gen_text = tokenizer.batch_decode(gen_ids, skip_special_tokens=True)
        torch.xpu.synchronize()
        toc = time.time()
        total_time += toc - tic
        total_list.append(output[1])

print(gen_text, flush=True)
print("\n", "-" * 10, "Summary:", "-" * 10)
latency = total_time / num_iter
print(f"inference-latency: {latency:.3f} sec.")
first_latency = np.mean([x[0] for x in total_list])
if args.max_new_tokens > 1:
    next_latency_list = list(chain(*[x[1:] for x in total_list]))
    next_latency_list.sort()
    average_next_latency = np.mean(next_latency_list)
    p90_latency = np.percentile(next_latency_list, 90)
print(f"first-token-latency: {first_latency:.3f} sec.")
if args.max_new_tokens > 1:
    print(f"rest-token-latency: {average_next_latency:.3f} sec.")
    print(f"P90-rest-token-latency: {p90_latency:.3f} sec.")
benchmarks/third_party/e2e-flex_attention/transform_results.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
import argparse
import re
import os
import uuid
import json
from datetime import datetime

import pandas as pd


def parse_args():
    parser = argparse.ArgumentParser(description='Parse LLM profiling log')
    parser.add_argument('log_file', help='Path to the LLM profiling log file')
    parser.add_argument('output_csv', help='Path to output CSV file')
    parser.add_argument('--tag', help='Tag for the benchmark run', default='')
    parser.add_argument('--model', help='Model name', default='unknown-model')
    parser.add_argument('--max-new-tokens', type=int, help='Maximum new tokens', default=128)
    parser.add_argument('--batch-size', type=int, help='Batch size', default=1)

    return parser.parse_args()


def parse_llm_log(log_file_path, tag, model, max_new_tokens, batch_size):
    """Parse the LLM profiling log and extract performance metrics."""

    with open(log_file_path, 'r', encoding='utf-8') as f:
        content = f.read()

    metrics = {}

    inference_match = re.search(r'inference-latency:\s+([\d.]+)\s+sec\.', content)
    if inference_match:
        metrics['inference_latency'] = float(inference_match.group(1))

    first_token_match = re.search(r'first-token-latency:\s+([\d.]+)\s+sec\.', content)
    if first_token_match:
        metrics['first_token_latency'] = float(first_token_match.group(1))

    rest_token_match = re.search(r'rest-token-latency:\s+([\d.]+)\s+sec\.', content)
    if rest_token_match:
        metrics['rest_token_latency'] = float(rest_token_match.group(1))

    p90_match = re.search(r'P90-rest-token-latency:\s+([\d.]+)\s+sec\.', content)
    if p90_match:
        metrics['p90_rest_token_latency'] = float(p90_match.group(1))

    prompt_match = re.search(r'Prompt size:\s+(\d+)', content)
    prompt_size = int(prompt_match.group(1)) if prompt_match else 1024

    params = {
        'model': model,
        'input_tokens': prompt_size,
        'max_new_tokens': max_new_tokens,
        'batch_size': batch_size,
    }
    params_json = json.dumps(params)

    rows = []
    run_uuid = uuid.uuid4().hex
    current_datetime = datetime.now().isoformat()

    # Create one row for each metric
    for metric_name, metric_value in metrics.items():
        row = {
            'benchmark': 'e2e-flex-attention',
            'run_uuid': run_uuid,
            'datetime': current_datetime,
            'compiler': 'triton',
            'metric_name': metric_name,
            'metric_value': metric_value,
            'params': params_json,
            'tag': tag,
        }
        rows.append(row)

    df_results = pd.DataFrame(rows)

    host_info = {
        n: os.getenv(n.upper(), default='')
        for n in [
            'libigc1_version',
            'level_zero_version',
            'gpu_device',
            'agama_version',
            'torch_version',
            'compiler_version',
            'benchmarking_method',
        ]
    }
    if not host_info['gpu_device']:
        raise RuntimeError('Could not find GPU device description, was `capture-hw-details.sh` called?')

    for name, val in host_info.items():
        df_results[name] = val

    print(f'Extracted metrics: {json.dumps(metrics, indent=2)}')
    print(f'DataFrame shape: {df_results.shape}')

    return df_results


def main():
    args = parse_args()
    if not os.path.exists(args.log_file):
        print(f'Error: Log file {args.log_file} not found')
        return 1

    df_results = parse_llm_log(args.log_file, args.tag, args.model, args.max_new_tokens, args.batch_size)
    df_results.to_csv(args.output_csv, index=False)
    print(f'Transformed CSV saved to {args.output_csv}')
    return 0


if __name__ == '__main__':
    main()
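
Outside of CI, the same transformation can be exercised by hand. A rough sketch, assuming a profiling log produced by run_llm_inductor_greedy.py sits in the current directory; the tag is a placeholder, and scripts/capture-hw-details.sh must be sourced first so that GPU_DEVICE and the other host-info variables the script reads are populated:

source ./scripts/capture-hw-details.sh
python transform_results.py llm.compile.xpu.profile.log llm-triton-report.csv \
    --tag local-test \
    --model meta-llama/Llama-3.1-8B \
    --max-new-tokens 128 \
    --batch-size 1

If GPU_DEVICE is empty the script raises a RuntimeError, which is why the workflow sources capture-hw-details.sh before calling it.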
benchmarks/third_party/e2e-flex_attention/transformers-commit.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
6e9972962fbc80d218234bfbd8c9b2843ef02b2b
