
Commit a4e9b4f

[CI][E2E accuracy] Add report aggregation and comparison with reference for E2E accuracy (#4939)
Implements #4455. What is done:

1. Generation of a concatenated report file (csv) that contains all rows from the separate per-configuration csv files.
2. Generation of an aggregated report file (csv) with ~45 rows, one per (suite, mode, dtype) configuration, including the pass rate and the failed models. The same report is also printed to the job output.
3. Comparison against the reference results using https://github.com/intel/torch-xpu-ops/blob/main/.github/ci_expected_accuracy/check_expected.py. The comparison is skipped when E2E is called with a specific model or with a subset of models.
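For illustration, each line of the aggregated report printed by build_suite_report below follows this pattern (the counts, outcome label, and model name in this sample are made up):

    suite=huggingface,mode=inference,dtype=float32,passrate=97.8%,pass=44,fail_to_run=1[GoogleFnet]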
1 parent 7597ef1 commit a4e9b4f

File tree

.github/pins/e2e_reference_torch-xpu-ops.txt
.github/workflows/e2e-accuracy.yml
scripts/e2e_checks/aggregate_e2e_results.py
scripts/e2e_checks/compare_reference.sh

4 files changed: +359 −0

.github/pins/e2e_reference_torch-xpu-ops.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
b7ccec02a390667cbe397dccf6642d3c7e131c77

.github/workflows/e2e-accuracy.yml

Lines changed: 51 additions & 0 deletions
@@ -132,3 +132,54 @@ jobs:
      only_one_model: ${{ inputs.only_one_model }}
      runner_label: ${{ inputs.runner_label || 'max1100' }}
      TORCH_COMPILE_DEBUG: ${{ inputs.TORCH_COMPILE_DEBUG }}

  summary:
    name: Aggregate and check results
    needs: [run_tests, setup]
    runs-on: linux
    if: always()
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: separate-reports

      - name: Run aggregation script
        run: |
          pip install pandas numpy
          ls -la separate-reports
          echo "Local dir"
          ls -la ./
          python scripts/e2e_checks/aggregate_e2e_results.py \
            --input-dir separate-reports \
            --output-dir aggregated-results

      - name: Upload aggregated results
        uses: actions/upload-artifact@v4
        with:
          name: aggregated-results-${{ github.run_id }}
          path: aggregated-results
          include-hidden-files: true

      - name: Check results against reference
        if: ${{ inputs.models == 'all' && inputs.only_one_model == '' }}
        run: |
          PYTORCH_XPU_OPS_REF="$(<.github/pins/e2e_reference_torch-xpu-ops.txt)"
          git clone https://github.com/intel/torch-xpu-ops.git
          cd torch-xpu-ops
          git checkout $PYTORCH_XPU_OPS_REF
          cd ..
          ./scripts/e2e_checks/compare_reference.sh \
            separate-reports \
            "./torch-xpu-ops" \
            '${{ needs.setup.outputs.suite }}' \
            '${{ needs.setup.outputs.mode }}' \
            '${{ needs.setup.outputs.dtype }}'
scripts/e2e_checks/aggregate_e2e_results.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
import argparse
from pathlib import Path

import pandas as pd


def parse_args():
    parser = argparse.ArgumentParser(description='Aggregate end-to-end test results')
    parser.add_argument('--input-dir', '-i', type=str, required=True, help='Input directory containing test results')
    parser.add_argument('--output-dir', '-o', type=str, required=True, help='Output directory for aggregated results')
    return parser.parse_args()


def parse_folder_name(folder_name):
    """
    Parse a folder name to extract suite, dtype, and mode.

    Expected format: logs-{suite}-{dtype}-{mode}-accuracy, where mode can contain `-` characters.
    Examples:
    - logs-torchbench-float32-inference-accuracy -> suite=torchbench, dtype=float32, mode=inference
    - logs-huggingface-amp_bf16-training-accuracy -> suite=huggingface, dtype=amp_bf16, mode=training
    """
    parts = folder_name.split('-')

    # Check if it follows the expected pattern
    if len(parts) < 4 or parts[0] != 'logs' or parts[-1] != 'accuracy':
        return None, None, None

    suite = parts[1]
    dtype = parts[2]
    # Extract mode, which can include dashes
    mode = '-'.join(parts[3:-1])

    return suite, dtype, mode


def build_suite_report(combined_df, output_path):
    print('=======================================')
    print('=            SUMMARY REPORT           =')
    print('=======================================')
    assert combined_df.groupby(['suite', 'mode', 'dtype', 'batch_size',
                                'name']).count().max().max() == 1, 'Discovered unexpected duplicates in results!'

    def fn(df):
        # Count accuracy outcomes and collect failing model names per outcome
        results = df['accuracy'].value_counts().to_dict()
        errors = df[~df['accuracy'].str.startswith('pass')]
        errors = errors.groupby('accuracy')['name'].apply(';'.join).to_dict()

        return results, errors

    agg = combined_df.groupby(['suite', 'mode', 'dtype']).apply(fn, include_groups=False)

    for index, row in agg.items():
        n_pass = sum(c for k, c in row[0].items() if k.startswith('pass'))
        n_total = sum(row[0].values())

        join_parts = []
        for k, v in row[0].items():
            if 'pass' in k:
                join_parts.append(f'{k}={v}')
            else:
                join_parts.append(f'{k}={v}[{row[1][k]}]')

        txt = f'suite={index[0]},mode={index[1]},dtype={index[2]},' + \
              f'passrate={n_pass / n_total if n_total > 0 else 0:.1%},' + \
              ','.join(join_parts)

        print(txt)

    # Unpack errors and failed models into new columns
    agg = agg.apply(lambda x: pd.Series({**x[0], **{k + '_models': v for k, v in x[1].items()}}))
    agg = agg.reset_index().fillna(0)

    agg.to_csv(output_path / 'summary_agg.csv', index=False)


def drop_duplicates(df, suite, mode):
    """Some (name, dtype) groups can have duplicates; print them and keep the first row of each group."""
    group_counts = df.groupby(['name', 'dtype']).size()
    duplicates = group_counts[group_counts > 1]

    if not duplicates.empty:
        print(f'Found {len(duplicates)} duplicate groups for {suite} {mode}:')
        for (name, dtype), _ in duplicates.items():
            print(df[df['name'].eq(name) & df['dtype'].eq(dtype)])
        print()
    return df.groupby(['name', 'dtype'], as_index=False).first()


def build_pytorch_report(combined_df, output_path):
    print('====================\nBuilding pytorch report\n====================')
    cols = ['name', 'float32', 'bfloat16', 'float16', 'amp_bf16', 'amp_fp16']

    torch_report_dir = output_path / 'torch_format_report'
    torch_report_dir.mkdir(parents=True, exist_ok=True)
    for suite, mode in combined_df[['suite', 'mode']].drop_duplicates().values:
        df_subset = combined_df[combined_df['suite'].eq(suite)
                                & combined_df['mode'].eq(mode)][['dtype', 'name', 'accuracy']]

        df_subset = drop_duplicates(df_subset, suite, mode)
        pivoted_df = df_subset.pivot(index='name', columns='dtype', values='accuracy')

        # Reset index to make 'name' a regular column
        pivoted_df = pivoted_df.reset_index()

        # Fill NaN values if some dtype/name combinations don't exist
        pivoted_df = pivoted_df.fillna('')

        # Keep only the known columns, in a stable order
        pivoted_df = pivoted_df[[c for c in cols if c in pivoted_df.columns]]

        pivoted_df.to_csv(torch_report_dir / f'inductor_{suite}_{mode}.csv', index=False)


def main(input_dir, output_dir):
    """
    Main function to aggregate end-to-end test results.

    Args:
        input_dir (str): Path to input directory containing test results
        output_dir (str): Path to output directory for aggregated results
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    if not input_path.exists():
        raise FileNotFoundError(f'Input directory does not exist: {input_path}')

    output_path.mkdir(parents=True, exist_ok=True)

    print(f'Processing results from: {input_path}')
    print(f'Output will be saved to: {output_path}')

    dfs = []
    for item_path in input_path.iterdir():
        name = item_path.name
        if not item_path.is_dir():
            continue

        suite, dtype, mode = parse_folder_name(name)
        if suite is None:
            print(f'Folder name \'{name}\' does not match expected pattern, skipping')
            continue
        filepath = item_path / suite / dtype / f'inductor_{suite}_{dtype}_{mode}_xpu_accuracy.csv'
        df = pd.read_csv(filepath)
        df['suite'] = suite
        df['mode'] = mode
        df['dtype'] = dtype
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df = combined_df.sort_values(['suite', 'mode', 'dtype'])

    # Artifacts
    # 1. Simple concatenation of all results with added suite, mode, dtype columns
    combined_df.to_csv(output_path / 'combined_results.csv', index=False)
    # 2. Torch-format report, 9 files (one per suite, mode), dtype stored as a column
    build_pytorch_report(combined_df, output_path=output_path)
    # 3. Aggregated report with 45 rows (suite, mode, dtype, passed, failed_REASON, failed_REASON model list)
    build_suite_report(combined_df, output_path=output_path)


if __name__ == '__main__':
    args = parse_args()
    main(args.input_dir, args.output_dir)
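For reference, the input layout implied by parse_folder_name and the filepath construction above can be sketched as follows (directory and file names here are a hypothetical example):

    separate-reports/
      logs-torchbench-float32-inference-accuracy/
        torchbench/
          float32/
            inductor_torchbench_float32_inference_xpu_accuracy.csv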
scripts/e2e_checks/compare_reference.sh

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
#!/bin/bash

set -e  # Exit on any error
set -u  # Exit on undefined variables

# Input arguments; default to empty so the usage check below is reachable under `set -u`
RESULT_DIR=${1:-}
TORCH_DIR=${2:-}
SUITES=${3:-'["huggingface", "torchbench", "timm_models"]'}
MODES=${4:-'["training", "inference"]'}
DTYPES=${5:-'["float32", "bfloat16", "float16", "amp_bf16", "amp_fp16"]'}

echo "Raw suites: $SUITES"
echo "Raw modes: $MODES"
echo "Raw dtypes: $DTYPES"

# Function to convert a JSON array to a space-separated string
convert_json_array() {
    local input="$1"
    # Check if input looks like a JSON array
    if [[ "$input" =~ ^\[.*\]$ ]]; then
        python3 -c "import json; print(' '.join(json.loads('$input')))"
    else
        echo "$input"
    fi
}

# Check if required arguments are provided
if [ -z "$RESULT_DIR" ] || [ -z "$TORCH_DIR" ]; then
    echo "Usage: $0 <RESULT_DIR> <TORCH_DIR> [SUITES] [MODES] [DTYPES]" >&2
    exit 1
fi

# Validate that directories exist
if [ ! -d "$RESULT_DIR" ]; then
    echo "ERROR: RESULT_DIR '$RESULT_DIR' does not exist" >&2
    exit 1
fi

if [ ! -d "$TORCH_DIR" ]; then
    echo "ERROR: TORCH_DIR '$TORCH_DIR' does not exist" >&2
    exit 1
fi

# Check if the Python script exists
PYTHON_SCRIPT="$TORCH_DIR/.github/ci_expected_accuracy/check_expected.py"
if [ ! -f "$PYTHON_SCRIPT" ]; then
    echo "ERROR: Python script '$PYTHON_SCRIPT' not found" >&2
    exit 1
fi

# Convert JSON arrays to space-separated strings if needed
SUITES=$(convert_json_array "$SUITES")
MODES=$(convert_json_array "$MODES")
DTYPES=$(convert_json_array "$DTYPES")

# Convert space-separated strings to arrays
IFS=' ' read -ra suites <<< "$SUITES"
IFS=' ' read -ra modes <<< "$MODES"
IFS=' ' read -ra dtypes <<< "$DTYPES"

# Variables to collect failed-model information and missing files
failed_models_output=""
missing_files=()
exit_code=0

echo "Suites: $SUITES"
echo "Modes: $MODES"
echo "Dtypes: $DTYPES"

# Nested loops over every (suite, mode, dtype) combination
for suite in "${suites[@]}"; do
    for mode in "${modes[@]}"; do
        # Skip inference-with-freezing mode since there is no reference for it in the pytorch project
        if [ "$mode" = "inference-with-freezing" ]; then
            echo "Skipping mode: $mode"
            continue
        fi

        for dtype in "${dtypes[@]}"; do
            CSV_FILE="$RESULT_DIR/logs-$suite-$dtype-$mode-accuracy/$suite/$dtype/inductor_${suite}_${dtype}_${mode}_xpu_accuracy.csv"

            # Check if CSV file exists
            if [ ! -f "$CSV_FILE" ]; then
                echo "Missing: $CSV_FILE"
                missing_files+=("$CSV_FILE")
                continue
            fi

            echo "Processing: $suite, $mode, $dtype"

            # Run the Python script and capture output
            output=$(python "$PYTHON_SCRIPT" \
                --driver rolling \
                --suite "$suite" \
                --mode "$mode" \
                --dtype "$dtype" \
                --csv_file "$CSV_FILE")

            # Print the output
            echo "$output"

            # Extract and concatenate summary lines; `|| true` keeps `set -e`
            # from aborting when grep finds no match
            summary_lines=$(echo "$output" | grep -E "(Real failed models:|Summary for)" || true)
            if [ -n "$summary_lines" ]; then
                failed_models_output="${failed_models_output}${summary_lines}"$'\n'
            fi

        done
    done
done

echo "========================================="
echo "Summary of all results:"
echo "$failed_models_output"

# Find lines with actual failures (not "Real failed models: 0")
failed_line=$(echo "$failed_models_output" | grep "Real failed models:" | grep -v "Real failed models: 0" || true)

echo "========================================="
echo "Summary of only failed models:"
if [ -n "$failed_line" ]; then
    echo "$failed_line"
    echo "ERROR: Found failed models!"
    exit_code=1
else
    echo "SUCCESS: All models passed!"
fi

# Missing files are treated as an error as well
if [ ${#missing_files[@]} -gt 0 ]; then
    echo "========================================="
    echo "ERROR: Missing files detected:"
    for file in "${missing_files[@]}"; do
        echo "  $file"
    done
    echo "Total missing files: ${#missing_files[@]}"
    exit_code=1
fi

exit $exit_code
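A usage sketch for running the comparison locally (paths and the JSON-array arguments are illustrative; in CI the workflow above passes the needs.setup.outputs.* values in the same JSON form):

    ./scripts/e2e_checks/compare_reference.sh \
        separate-reports \
        ./torch-xpu-ops \
        '["huggingface"]' \
        '["inference"]' \
        '["float32"]'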
