
Commit 0760724

[CI] Add llama case for profile test (#1716)
Add a llama inference test that checks operator call counts when profiling
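
The new check runs a Llama-3 inference under torch.profiler (test/profiling/llama.py), aggregates the call counts of non-aten operations into a CSV (.github/scripts/llama_summary.py), and compares that summary against a checked-in baseline (.github/scripts/check_baseline.sh with .github/scripts/llama_baseline.csv).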
1 parent b1757dd commit 0760724

File tree (5 files changed: +217 -13 lines)

.github/scripts/check_baseline.sh
.github/scripts/llama_baseline.csv
.github/scripts/llama_summary.py
.github/workflows/_linux_ut.yml
test/profiling/llama.py
.github/scripts/check_baseline.sh

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
#!/bin/bash
file1="$1"
file2="$2"

if [ ! -f "$file1" ] || [ ! -f "$file2" ]; then
    echo "Error: One or both files do not exist" >&2
    exit 1
fi

if ! diff_output=$(diff <(sort "$file1") <(sort "$file2")); then
    echo "ERROR: Files $file1 and $file2 differ!" >&2
    echo "Differences found:" >&2
    echo "$diff_output" >&2
    # Exit non-zero so CI flags the mismatch instead of falling through to SUCCESS.
    exit 1
fi

echo "SUCCESS: Files $file1 and $file2 are the same"
exit 0
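
In the workflow below, the script is invoked as bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv, comparing the checked-in baseline against the freshly generated summary. Sorting both files before diffing makes the check insensitive to row order.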

.github/scripts/llama_baseline.csv

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
Operation Name,Section 1 Calls,Section 2 Calls,Section 3 Calls,Section 4 Calls,Section 5 Calls,Total Calls
urEnqueueKernelLaunch,1590,1590,1590,1590,1590,7950
"at::native::xpu::VectorizedElementwiseKernel<8, at::...",449,449,449,449,449,2245
at::native::xpu::ElementwiseGlobalRangeKernel<at::na...,390,390,390,390,390,1950
gemm_kernel,226,226,226,226,226,1130
"at::native::xpu::VectorizedElementwiseKernel<4, at::...",210,210,210,210,210,1050
urUSMDeviceAlloc,164,160,160,160,160,804
Memset (DEVICE),160,160,160,160,160,800
urEnqueueUSMFill,160,160,160,160,160,800
at::native::xpu::UnrolledElementwiseKernel<at::nativ...,138,138,138,138,138,690
"at::native::xpu::ReduceKernel<1, at::native::xpu::Re...",74,74,74,74,74,370
micro_sdpa,32,32,32,32,32,160
"at::native::xpu::VectorizedElementwiseKernel<2, at::...",16,16,16,16,16,80
"at::native::xpu::VectorizedElementwiseKernel<16, at:...",14,14,14,14,14,70
urEnqueueUSMMemcpy,10,10,10,10,10,50
at::native::xpu::SegmentedRadixSortPairsDownsweepFun...,8,8,8,8,8,40
at::native::xpu::SegmentedRadixSortPairsUpsweepFunct...,8,8,8,8,8,40
at::native::xpu::SegmentedRadixSortPairsScanFunctor<...,8,8,8,8,8,40
Memcpy D2M (DEVICE -> MEMORY(Unknown)),6,6,6,6,6,30
at::native::xpu::SegmentScanKernel<at::native::xpu::...,4,4,4,4,4,20
at::native::xpu::impl::SoftmaxForwardKernelFunctor<4...,2,2,2,2,2,10
Memcpy H2D (HOST -> DEVICE),2,2,2,2,2,10
at::native::xpu::CatArrayBatchedCopyKernelFunctor<lo...,2,2,2,2,2,10
at::native::xpu::AssertAsyncKernelFunctor1<bool>,2,2,2,2,2,10
Memcpy M2D (MEMORY(Unknown) -> DEVICE),2,2,2,2,2,10
detach_,2,2,2,2,2,10
at::native::xpu::SegmentedGroupRadixSelectPairsFunct...,1,1,1,1,1,5
at::native::xpu::SegmentedGroupRadixSortPairsFunctor...,1,1,1,1,1,5
at::native::xpu::ScatterGatherElementwiseKernelFunct...,1,1,1,1,1,5
"at::native::xpu::ReduceKernel<2, at::native::xpu::Re...",1,1,1,1,1,5
at::native::xpu::IndexKernel<at::native::xpu::IndexK...,1,1,1,1,1,5
at::native::xpu::DistributionElementwiseKernelFuncto...,1,1,1,1,1,5
at::native::xpu::AccumulateCarrierKernelFunctor<at::...,1,1,1,1,1,5
urUSMHostAlloc,1,0,0,0,0,1
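
Each "Section N Calls" column holds the counts from the Nth profiled iteration of llama.py, and "Total Calls" is their sum, matching the header row written by llama_summary.py. The first iteration allocates slightly more (urUSMDeviceAlloc 164 vs. 160, urUSMHostAlloc 1 vs. 0), presumably one-time warm-up allocations.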

.github/scripts/llama_summary.py

Lines changed: 115 additions & 0 deletions
@@ -0,0 +1,115 @@
import re
import csv
import argparse
from collections import defaultdict

def parse_log_sections(log_content):
    sections = []
    # One section per "datatype: torch.float16 ; i: N" header printed by llama.py
    pattern = re.compile(r'^(datatype: torch\.float16 ; i: \d+)(.*?)(?=^datatype: |\Z)', re.MULTILINE | re.DOTALL)

    for match in pattern.finditer(log_content):
        header = match.group(1).strip()
        content = match.group(2).strip()
        if content:
            sections.append((header, content))

    return sections

def extract_non_aten_data(section_content):
    # One profiler-table row: the operation name, nine percentage/time
    # columns, and the trailing call count (group 11).
    pattern = re.compile(
        r'^\s*([^\s].*?)\s+(\d+\.\d+%|\d+%)\s+(\d+\.\d+\w*s)\s+(\d+\.\d+%|\d+%)\s+(\d+\.\d+\w*s)\s+'
        r'(\d+\.\d+\w*s)\s+(\d+\.\d+\w*s)\s+(\d+\.\d+%|\d+%)\s+(\d+\.\d+\w*s)\s+'
        r'(\d+\.\d+\w*s)\s+(\d+)',
        re.MULTILINE
    )

    section_data = []
    for match in pattern.finditer(section_content):
        name = match.group(1).strip()
        if not name.startswith('aten::'):
            num_calls = int(match.group(11))
            section_data.append((name, num_calls))

    return section_data

def process_log_file(input_file):
    with open(input_file) as f:
        log_content = f.read()

    sections = parse_log_sections(log_content)
    all_data = defaultdict(dict)
    section_headers = []
    duplicate_names = defaultdict(list)

    print("\nSections found in the test log:")
    for i, (header, content) in enumerate(sections):
        print(f"[part {i+1}] {header}")
        section_headers.append(header)
        section_data = extract_non_aten_data(content)

        # Track duplicate names within the same section
        seen_in_section = defaultdict(int)
        for name, num_calls in section_data:
            seen_in_section[name] += 1
            if seen_in_section[name] > 1:
                duplicate_names[name].append((i, num_calls))

        for name, num_calls in section_data:
            all_data[name][i] = all_data[name].get(i, 0) + num_calls

    # Print duplicate names and their calls
    if duplicate_names:
        print("\nDuplicate names found:")
        for name, calls in duplicate_names.items():
            print(f"Name: {name}")
            for section_idx, num_calls in calls:
                print(f"  Section {section_idx+1}: {num_calls} calls")
    else:
        print("\nNo duplicate names found.")

    return all_data, section_headers

def write_to_csv(data, section_headers, output_file):
    with open(output_file, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)

        headers = ['Operation Name'] + [f"Section {i+1} Calls" for i in range(len(section_headers))] + ['Total Calls']
        writer.writerow(headers)
        # Rows sorted by total call count, descending
        for name, calls_data in sorted(data.items(), key=lambda x: sum(x[1].values()), reverse=True):
            row = [name]
            total = 0
            for i in range(len(section_headers)):
                calls = calls_data.get(i, 0)
                row.append(str(calls))
                total += calls
            row.append(str(total))
            writer.writerow(row)

    print(f"\nGenerated result CSV file: {output_file}")

def main():
    parser = argparse.ArgumentParser(description='obtain the call counts of non-aten ops')
    parser.add_argument('-i', '--input', required=True, help='input log path')
    parser.add_argument('-o', '--output', default='output.csv',
                        help='output summary file')

    args = parser.parse_args()

    try:
        print(f"\nProcessing the log file: {args.input}")
        csv_data, section_headers = process_log_file(args.input)

        if csv_data:
            write_to_csv(csv_data, section_headers, args.output)
            print("\nSummary of non-aten ops:")
            for name, calls in csv_data.items():
                print(f"{name}: {calls}")
        else:
            print("Warning: no non-aten ops found")
    except FileNotFoundError:
        print(f"Error: Input file {args.input} not found")
    except Exception as e:
        print(f"Error when processing the input file: {str(e)}")

if __name__ == "__main__":
    main()
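
For reference, a minimal sketch of the log shape this parser expects; the rows below are synthetic, and the 11-column layout (operation name, nine percentage/time columns, then the call count) is assumed from the table that test/profiling/llama.py prints via prof.key_averages().table():

# Hypothetical snippet: one section header from llama.py, then two
# synthetic profiler-table rows in the 11-column layout the regex expects.
sample_log = (
    "datatype: torch.float16 ; i: 0\n"
    "aten::matmul  0.10%  0.100ms  0.20%  0.200ms  0.100ms"
    "  0.000us  0.00%  0.000us  0.000us  32\n"
    "gemm_kernel  0.00%  0.000us  0.00%  0.000us  0.000us"
    "  1.234ms  5.67%  1.234ms  38.56us  32\n"
)

for header, content in parse_log_sections(sample_log):
    # aten::* rows are filtered out, so only the backend kernel remains
    print(header, extract_non_aten_data(content))
# -> datatype: torch.float16 ; i: 0 [('gemm_kernel', 32)]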

.github/workflows/_linux_ut.yml

Lines changed: 20 additions & 13 deletions
@@ -147,36 +147,43 @@ jobs:
         pip install pytest pytest-timeout
     - name: 'xpu_profiling'
       condition: ${{ inputs.driver == 'rolling' && contains(inputs.ut, 'xpu_profiling') }}
+      directory: '$GITHUB_WORKSPACE'
       command_script: |
+        mkdir -p $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce
         # RN50 Test
         PROFILE=1 python -u test/profiling/rn50.py -a resnet50 --dummy ./ --num-iterations 20 --xpu 0
-        cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/profile_test
+        cp profiling.fp32.train.pt $GITHUB_WORKSPACE/ut_log/xpu_profiling
 
         # All Issue Reproduce UT
         python -u test/profiling/correlation_id_mixed.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/correlation_id_mixed.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/correlation_id_mixed.log
         python -u test/profiling/reproducer.missing.gpu.kernel.time.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/reproducer.missing.gpu.kernel.time.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/reproducer.missing.gpu.kernel.time.log
         python -u test/profiling/time_precision_in_profile.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/time_precision_in_profile.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/time_precision_in_profile.log
         python -u test/profiling/profile_partial_runtime_ops.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/profile_partial_runtime_ops.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/profile_partial_runtime_ops.log
         python -u test/profiling/triton_xpu_ops_time.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/issue_reproduce/triton_xpu_ops_time.log
-
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/issue_reproduce/triton_xpu_ops_time.log
+
+        # llama case for calls number test
+        python test/profiling/llama.py | \
+        tee ${{ github.workspace }}/ut_log/xpu_profiling/llama.log
+        python .github/scripts/llama_summary.py -i ${{ github.workspace }}/ut_log/xpu_profiling/llama.log -o ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv
+        bash .github/scripts/check_baseline.sh .github/scripts/llama_baseline.csv ${{ github.workspace }}/ut_log/xpu_profiling/llama_summary.csv
+
         # All xpu ut under test/profiler
         cd ../pytorch/test/profiler
         python -m pytest --timeout 600 -vs test_cpp_thread.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/test_cpp_thread.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_cpp_thread.log
         python -m pytest --timeout 600 -vs test_execution_trace.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/test_execution_trace.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_execution_trace.log
         python -m pytest --timeout 600 -vs test_memory_profiler.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/test_memory_profiler.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_memory_profiler.log
         python -m pytest --timeout 600 -vs test_profiler_tree.py | \
-        tee $GITHUB_WORKSPACE/ut_log/profile_test/test_profiler_tree.log
+        tee $GITHUB_WORKSPACE/ut_log/xpu_profiling/test_profiler_tree.log
       additional_steps: |
-        pip install pytest pytest-timeout
-        mkdir -p ut_log/profile_test/issue_reproduce
+        pip install pytest pytest-timeout transformers
     outputs:
       ut_name: ${{ steps.set-output.outputs.UT_NAME || '' }}
     steps:
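
Note that additional_steps now installs transformers alongside pytest and pytest-timeout, since test/profiling/llama.py imports AutoModelForCausalLM and AutoTokenizer from it, and the mkdir for the log directory moves into command_script, targeting the renamed ut_log/xpu_profiling path.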

test/profiling/llama.py

Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
)
model.eval().to("xpu")

prompt = "If Alice is older than Bob, and Bob is older than Charlie, who is the youngest? Explain your reasoning."
inputs = tokenizer(prompt, return_tensors="pt").to("xpu")

with torch.no_grad():
    for i in range(5):
        print(
            "datatype:",
            torch.float16,
            "; i:",
            i,
        )
        with torch.profiler.profile(
            activities=[
                torch.profiler.ProfilerActivity.CPU,
                torch.profiler.ProfilerActivity.XPU,
            ]
        ) as prof:
            outputs = model.generate(**inputs, max_new_tokens=1)
        print(prof.key_averages().table(sort_by="xpu_time_total", row_limit=-1))
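
Each of the five iterations prints a "datatype: torch.float16 ; i: N" header followed by the profiler table from key_averages().table(), which is exactly the shape llama_summary.py parses. Generating a single token per iteration (max_new_tokens=1) keeps the per-iteration operator call counts small and, presumably, stable enough to compare against the baseline.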
