
Commit a4e9b4f

[CI][E2E accuracy] Add report aggregation and comparison with reference for E2E accuracy (#4939)
Implements #4455. What is done:

1. Generation of a concatenated report file (csv) that contains all rows from the separate per-configuration csv files.
2. Generation of an aggregated report file (csv) with ~45 rows, one per (suite, mode, dtype) configuration, including the pass rate and the failed models. The same report is also printed to the job output.
3. Comparison against the reference results using https://github.com/intel/torch-xpu-ops/blob/main/.github/ci_expected_accuracy/check_expected.py. The comparison is skipped when E2E is called with a specific model or with a subset of models.
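For illustration, each line of the aggregated report printed by build_suite_report below follows this pattern (the counts, outcome label, and model name in this sample are made up):

    suite=huggingface,mode=inference,dtype=float32,passrate=97.8%,pass=44,fail_to_run=1[GoogleFnet]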
1 parent 7597ef1 commit a4e9b4f

File tree

.github/pins/e2e_reference_torch-xpu-ops.txt
.github/workflows/e2e-accuracy.yml
scripts/e2e_checks/aggregate_e2e_results.py
scripts/e2e_checks/compare_reference.sh

4 files changed: +359 −0

.github/pins/e2e_reference_torch-xpu-ops.txt

Lines changed: 1 addition & 0 deletions
@@ -0,0 +1 @@
b7ccec02a390667cbe397dccf6642d3c7e131c77

.github/workflows/e2e-accuracy.yml

Lines changed: 51 additions & 0 deletions
@@ -132,3 +132,54 @@ jobs:
      only_one_model: ${{ inputs.only_one_model }}
      runner_label: ${{ inputs.runner_label || 'max1100' }}
      TORCH_COMPILE_DEBUG: ${{ inputs.TORCH_COMPILE_DEBUG }}

  summary:
    name: Aggregate and check results
    needs: [run_tests, setup]
    runs-on: linux
    if: always()
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - name: Download all artifacts
        uses: actions/download-artifact@v4
        with:
          path: separate-reports

      - name: Run aggregation script
        run: |
          pip install pandas numpy
          ls -la separate-reports
          echo "Local dir"
          ls -la ./
          python scripts/e2e_checks/aggregate_e2e_results.py \
            --input-dir separate-reports \
            --output-dir aggregated-results

      - name: Upload aggregated results
        uses: actions/upload-artifact@v4
        with:
          name: aggregated-results-${{ github.run_id }}
          path: aggregated-results
          include-hidden-files: true

      - name: Check results against reference
        if: ${{ inputs.models == 'all' && inputs.only_one_model == '' }}
        run: |
          PYTORCH_XPU_OPS_REF="$(<.github/pins/e2e_reference_torch-xpu-ops.txt)"
          git clone https://github.com/intel/torch-xpu-ops.git
          cd torch-xpu-ops
          git checkout $PYTORCH_XPU_OPS_REF
          cd ..
          ./scripts/e2e_checks/compare_reference.sh \
            separate-reports \
            "./torch-xpu-ops" \
            '${{ needs.setup.outputs.suite }}' \
            '${{ needs.setup.outputs.mode }}' \
            '${{ needs.setup.outputs.dtype }}'
scripts/e2e_checks/aggregate_e2e_results.py

Lines changed: 163 additions & 0 deletions
@@ -0,0 +1,163 @@
import argparse
from pathlib import Path

import pandas as pd


def parse_args():
    parser = argparse.ArgumentParser(description='Aggregate end-to-end test results')
    parser.add_argument('--input-dir', '-i', type=str, required=True, help='Input directory containing test results')
    parser.add_argument('--output-dir', '-o', type=str, required=True, help='Output directory for aggregated results')
    return parser.parse_args()


def parse_folder_name(folder_name):
    """
    Parse a folder name to extract suite, dtype, and mode.

    Expected format: logs-{suite}-{dtype}-{mode}-accuracy, where mode can contain `-` characters.
    Examples:
    - logs-torchbench-float32-inference-accuracy -> suite=torchbench, dtype=float32, mode=inference
    - logs-huggingface-amp_bf16-training-accuracy -> suite=huggingface, dtype=amp_bf16, mode=training
    """
    parts = folder_name.split('-')

    # Check if it follows the expected pattern
    if len(parts) < 4 or parts[0] != 'logs' or parts[-1] != 'accuracy':
        return None, None, None

    suite = parts[1]
    dtype = parts[2]
    # Extract mode, which can include dashes
    mode = '-'.join(parts[3:-1])

    return suite, dtype, mode


def build_suite_report(combined_df, output_path):
    print('=======================================')
    print('=            SUMMARY REPORT           =')
    print('=======================================')
    assert combined_df.groupby(['suite', 'mode', 'dtype', 'batch_size',
                                'name']).count().max().max() == 1, 'Discovered unexpected duplicates in results!'

    def fn(df):
        # Count accuracy outcomes and collect failing model names per outcome
        results = df['accuracy'].value_counts().to_dict()
        errors = df[~df['accuracy'].str.startswith('pass')]
        errors = errors.groupby('accuracy')['name'].apply(';'.join).to_dict()

        return results, errors

    agg = combined_df.groupby(['suite', 'mode', 'dtype']).apply(fn, include_groups=False)

    for index, row in agg.items():
        n_pass = sum(c for k, c in row[0].items() if k.startswith('pass'))
        n_total = sum(row[0].values())

        join_parts = []
        for k, v in row[0].items():
            if 'pass' in k:
                join_parts.append(f'{k}={v}')
            else:
                join_parts.append(f'{k}={v}[{row[1][k]}]')

        txt = f'suite={index[0]},mode={index[1]},dtype={index[2]},' + \
              f'passrate={n_pass / n_total if n_total > 0 else 0:.1%},' + \
              ','.join(join_parts)

        print(txt)

    # Unpack errors and failed models into new columns
    agg = agg.apply(lambda x: pd.Series({**x[0], **{k + '_models': v for k, v in x[1].items()}}))
    agg = agg.reset_index().fillna(0)

    agg.to_csv(output_path / 'summary_agg.csv', index=False)


def drop_duplicates(df, suite, mode):
    """Some (name, dtype) groups can have duplicates; print them and keep the first row of each group."""
    group_counts = df.groupby(['name', 'dtype']).size()
    duplicates = group_counts[group_counts > 1]

    if not duplicates.empty:
        print(f'Found {len(duplicates)} duplicate groups for {suite} {mode}:')
        for (name, dtype), _ in duplicates.items():
            print(df[df['name'].eq(name) & df['dtype'].eq(dtype)])
        print()
    return df.groupby(['name', 'dtype'], as_index=False).first()


def build_pytorch_report(combined_df, output_path):
    print('====================\nBuilding pytorch report\n====================')
    cols = ['name', 'float32', 'bfloat16', 'float16', 'amp_bf16', 'amp_fp16']

    torch_report_dir = output_path / 'torch_format_report'
    torch_report_dir.mkdir(parents=True, exist_ok=True)
    for suite, mode in combined_df[['suite', 'mode']].drop_duplicates().values:
        df_subset = combined_df[combined_df['suite'].eq(suite)
                                & combined_df['mode'].eq(mode)][['dtype', 'name', 'accuracy']]

        df_subset = drop_duplicates(df_subset, suite, mode)
        pivoted_df = df_subset.pivot(index='name', columns='dtype', values='accuracy')

        # Reset index to make 'name' a regular column
        pivoted_df = pivoted_df.reset_index()

        # Fill NaN values if some dtype/name combinations don't exist
        pivoted_df = pivoted_df.fillna('')

        # Keep only the known columns, in a stable order
        pivoted_df = pivoted_df[[c for c in cols if c in pivoted_df.columns]]

        pivoted_df.to_csv(torch_report_dir / f'inductor_{suite}_{mode}.csv', index=False)


def main(input_dir, output_dir):
    """
    Main function to aggregate end-to-end test results.

    Args:
        input_dir (str): Path to input directory containing test results
        output_dir (str): Path to output directory for aggregated results
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)

    if not input_path.exists():
        raise FileNotFoundError(f'Input directory does not exist: {input_path}')

    output_path.mkdir(parents=True, exist_ok=True)

    print(f'Processing results from: {input_path}')
    print(f'Output will be saved to: {output_path}')

    dfs = []
    for item_path in input_path.iterdir():
        name = item_path.name
        if not item_path.is_dir():
            continue

        suite, dtype, mode = parse_folder_name(name)
        if suite is None:
            print(f'Folder name \'{name}\' does not match expected pattern, skipping')
            continue
        filepath = item_path / suite / dtype / f'inductor_{suite}_{dtype}_{mode}_xpu_accuracy.csv'
        df = pd.read_csv(filepath)
        df['suite'] = suite
        df['mode'] = mode
        df['dtype'] = dtype
        dfs.append(df)

    combined_df = pd.concat(dfs, ignore_index=True)
    combined_df = combined_df.sort_values(['suite', 'mode', 'dtype'])

    # Artifacts
    # 1. Simple concatenation of all results with added suite, mode, dtype columns
    combined_df.to_csv(output_path / 'combined_results.csv', index=False)
    # 2. Torch-format report, 9 files (one per suite, mode), dtype stored as a column
    build_pytorch_report(combined_df, output_path=output_path)
    # 3. Aggregated report with 45 rows (suite, mode, dtype, passed, failed_REASON, failed_REASON model list)
    build_suite_report(combined_df, output_path=output_path)


if __name__ == '__main__':
    args = parse_args()
    main(args.input_dir, args.output_dir)
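For reference, the input layout implied by parse_folder_name and the filepath construction above can be sketched as follows (directory and file names here are a hypothetical example):

    separate-reports/
      logs-torchbench-float32-inference-accuracy/
        torchbench/
          float32/
            inductor_torchbench_float32_inference_xpu_accuracy.csv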
scripts/e2e_checks/compare_reference.sh

Lines changed: 144 additions & 0 deletions
@@ -0,0 +1,144 @@
#!/bin/bash

set -e  # Exit on any error
set -u  # Exit on undefined variables

# Input arguments; default to empty so the usage check below is reachable under `set -u`
RESULT_DIR=${1:-}
TORCH_DIR=${2:-}
SUITES=${3:-'["huggingface", "torchbench", "timm_models"]'}
MODES=${4:-'["training", "inference"]'}
DTYPES=${5:-'["float32", "bfloat16", "float16", "amp_bf16", "amp_fp16"]'}

echo "Raw suites: $SUITES"
echo "Raw modes: $MODES"
echo "Raw dtypes: $DTYPES"

# Function to convert a JSON array to a space-separated string
convert_json_array() {
    local input="$1"
    # Check if input looks like a JSON array
    if [[ "$input" =~ ^\[.*\]$ ]]; then
        python3 -c "import json; print(' '.join(json.loads('$input')))"
    else
        echo "$input"
    fi
}

# Check if required arguments are provided
if [ -z "$RESULT_DIR" ] || [ -z "$TORCH_DIR" ]; then
    echo "Usage: $0 <RESULT_DIR> <TORCH_DIR> [SUITES] [MODES] [DTYPES]" >&2
    exit 1
fi

# Validate that directories exist
if [ ! -d "$RESULT_DIR" ]; then
    echo "ERROR: RESULT_DIR '$RESULT_DIR' does not exist" >&2
    exit 1
fi

if [ ! -d "$TORCH_DIR" ]; then
    echo "ERROR: TORCH_DIR '$TORCH_DIR' does not exist" >&2
    exit 1
fi

# Check if the Python script exists
PYTHON_SCRIPT="$TORCH_DIR/.github/ci_expected_accuracy/check_expected.py"
if [ ! -f "$PYTHON_SCRIPT" ]; then
    echo "ERROR: Python script '$PYTHON_SCRIPT' not found" >&2
    exit 1
fi

# Convert JSON arrays to space-separated strings if needed
SUITES=$(convert_json_array "$SUITES")
MODES=$(convert_json_array "$MODES")
DTYPES=$(convert_json_array "$DTYPES")

# Convert space-separated strings to arrays
IFS=' ' read -ra suites <<< "$SUITES"
IFS=' ' read -ra modes <<< "$MODES"
IFS=' ' read -ra dtypes <<< "$DTYPES"

# Variables to collect failed-model information and missing files
failed_models_output=""
missing_files=()
exit_code=0

echo "Suites: $SUITES"
echo "Modes: $MODES"
echo "Dtypes: $DTYPES"

# Nested loops over every (suite, mode, dtype) combination
for suite in "${suites[@]}"; do
    for mode in "${modes[@]}"; do
        # Skip inference-with-freezing mode since there is no reference for it in the pytorch project
        if [ "$mode" = "inference-with-freezing" ]; then
            echo "Skipping mode: $mode"
            continue
        fi

        for dtype in "${dtypes[@]}"; do
            CSV_FILE="$RESULT_DIR/logs-$suite-$dtype-$mode-accuracy/$suite/$dtype/inductor_${suite}_${dtype}_${mode}_xpu_accuracy.csv"

            # Check if CSV file exists
            if [ ! -f "$CSV_FILE" ]; then
                echo "Missing: $CSV_FILE"
                missing_files+=("$CSV_FILE")
                continue
            fi

            echo "Processing: $suite, $mode, $dtype"

            # Run the Python script and capture output
            output=$(python "$PYTHON_SCRIPT" \
                --driver rolling \
                --suite "$suite" \
                --mode "$mode" \
                --dtype "$dtype" \
                --csv_file "$CSV_FILE")

            # Print the output
            echo "$output"

            # Extract and concatenate summary lines; `|| true` keeps `set -e`
            # from aborting when grep finds no match
            summary_lines=$(echo "$output" | grep -E "(Real failed models:|Summary for)" || true)
            if [ -n "$summary_lines" ]; then
                failed_models_output="${failed_models_output}${summary_lines}"$'\n'
            fi

        done
    done
done

echo "========================================="
echo "Summary of all results:"
echo "$failed_models_output"

# Find lines with actual failures (not "Real failed models: 0")
failed_line=$(echo "$failed_models_output" | grep "Real failed models:" | grep -v "Real failed models: 0" || true)

echo "========================================="
echo "Summary of only failed models:"
if [ -n "$failed_line" ]; then
    echo "$failed_line"
    echo "ERROR: Found failed models!"
    exit_code=1
else
    echo "SUCCESS: All models passed!"
fi

# Missing files are treated as an error as well
if [ ${#missing_files[@]} -gt 0 ]; then
    echo "========================================="
    echo "ERROR: Missing files detected:"
    for file in "${missing_files[@]}"; do
        echo "  $file"
    done
    echo "Total missing files: ${#missing_files[@]}"
    exit_code=1
fi

exit $exit_code
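A usage sketch for running the comparison locally (paths and the JSON-array arguments are illustrative; in CI the workflow above passes the needs.setup.outputs.* values in the same JSON form):

    ./scripts/e2e_checks/compare_reference.sh \
        separate-reports \
        ./torch-xpu-ops \
        '["huggingface"]' \
        '["inference"]' \
        '["float32"]'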
