3 changes: 2 additions & 1 deletion .gitignore
@@ -8,4 +8,5 @@ data/all_data_912
inference/outputs/*
inference/log/*

outputs
outputs
/data_copy
188 changes: 188 additions & 0 deletions analyze_ssb_outputs.py
@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Analyze SSB outputs across multiple runs and generate CSV with scores per task.
"""

import json
import os
import csv
from typing import Dict, List


def load_json_results(json_path: str) -> Dict[str, float]:
"""Load JSON results and return dict mapping task_id to score."""
with open(json_path, "r") as f:
data = json.load(f)

results = {}
for entry in data:
# Normalize task_id to string (JSON has mixed int/str types)
task_id = str(entry["id"])
# Use result as score (1 for success, 0 for failure)
score = entry["result"]
results[task_id] = score

return results


def get_all_task_ids(spreadsheet_dir: str) -> List[str]:
"""Get all task IDs from the spreadsheet directory."""
task_ids = []
for item in sorted(os.listdir(spreadsheet_dir)):
path = os.path.join(spreadsheet_dir, item)
if os.path.isdir(path):
task_ids.append(item)
return task_ids


def main():
# Paths
ssb_outputs_dir = "ssb_outputs_2"
spreadsheet_dir = "data/all_data_912/spreadsheet"
output_csv = "ssb_scores_per_task.csv"

# Get all JSON files
json_files = sorted(
[
os.path.join(ssb_outputs_dir, f)
for f in os.listdir(ssb_outputs_dir)
if f.endswith(".json")
]
)

print(f"Found {len(json_files)} JSON files:")
for jf in json_files:
print(f" - {jf}")

# Load results from each run
run_results = []
for json_file in json_files:
results = load_json_results(json_file)
run_results.append(results)
print(f"Loaded {len(results)} results from {os.path.basename(json_file)}")

# Get all task IDs
all_task_ids = get_all_task_ids(spreadsheet_dir)
print(f"\nTotal tasks in dataset: {len(all_task_ids)}")

# Create CSV with scores per task
print(f"\nWriting CSV to {output_csv}...")
with open(output_csv, "w", newline="") as csvfile:
fieldnames = ["task_id"] + [f"run_{i + 1}" for i in range(len(json_files))]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

for task_id in all_task_ids:
row = {"task_id": task_id}
for i, run_result in enumerate(run_results):
# Use None/empty for tasks that didn't complete
row[f"run_{i + 1}"] = run_result.get(task_id, "")
writer.writerow(row)

print(f"CSV written successfully!")

# Analysis 1: % failures per run (tasks that didn't run/missing from JSON)
print("\n" + "=" * 60)
print("ANALYSIS 1: Task Failure Rate Per Run (Missing from JSON)")
print("=" * 60)
for i, run_result in enumerate(run_results):
total_tasks = len(all_task_ids)
completed_tasks = len(run_result)
failed_tasks = total_tasks - completed_tasks
failure_rate = (failed_tasks / total_tasks) * 100
print(
f"Run {i + 1}: {failed_tasks}/{total_tasks} tasks didn't run ({failure_rate:.2f}%)"
)

# Average success rate per run (for tasks that ran)
print("\n" + "=" * 60)
print("ANALYSIS 1.5: Average Success Rate Per Run (For Completed Tasks)")
print("=" * 60)
for i, run_result in enumerate(run_results):
if run_result:
avg_success = sum(run_result.values()) / len(run_result)
print(f"Run {i + 1}: {avg_success:.4f} ({len(run_result)} tasks completed)")
else:
print(f"Run {i + 1}: No tasks completed")

    # Analysis 2: Tasks that failed (missing) across all runs
print("\n" + "=" * 60)
print("ANALYSIS 2: Tasks That Failed (Missing) Across All Runs")
print("=" * 60)
failed_all_runs = []
for task_id in all_task_ids:
# Check if task is missing from all runs
missing_in_all = all(task_id not in run_result for run_result in run_results)
if missing_in_all:
failed_all_runs.append(task_id)

print(
f"Total tasks that failed in all {len(json_files)} runs: {len(failed_all_runs)}"
)
if failed_all_runs:
print("\nTasks:")
for task_id in failed_all_runs[:20]: # Show first 20
print(f" - {task_id}")
if len(failed_all_runs) > 20:
print(f" ... and {len(failed_all_runs) - 20} more")

# Analysis 3: Average of average task scores (excluding tasks that didn't run)
print("\n" + "=" * 60)
print("ANALYSIS 3: Average Task Scores (Excluding Missing Tasks)")
print("=" * 60)

task_averages = []
task_details = []
for task_id in all_task_ids:
scores = []
for run_result in run_results:
if task_id in run_result:
scores.append(run_result[task_id])

if scores: # Only compute average if task ran at least once
avg_score = sum(scores) / len(scores)
task_averages.append(avg_score)
task_details.append(
{
"task_id": task_id,
"scores": scores,
"avg": avg_score,
"num_runs": len(scores),
}
)

if task_averages:
overall_average = sum(task_averages) / len(task_averages)
print(f"Number of tasks that ran at least once: {len(task_averages)}")
print(f"Average of average task scores: {overall_average:.4f}")

# Show some examples to verify
print(f"\nSample of task averages (first 10):")
for detail in task_details[:10]:
print(
f" {detail['task_id']}: scores={detail['scores']}, avg={detail['avg']:.2f}, runs={detail['num_runs']}"
)

# Count how many tasks have perfect scores
perfect_tasks = sum(1 for avg in task_averages if avg == 1.0)
zero_tasks = sum(1 for avg in task_averages if avg == 0.0)
print(f"\nTasks with perfect avg (1.0): {perfect_tasks}")
print(f"Tasks with zero avg (0.0): {zero_tasks}")
print(
f"Tasks with mixed results: {len(task_averages) - perfect_tasks - zero_tasks}"
)
else:
print("No tasks ran in any run!")

# Additional stats
print("\n" + "=" * 60)
print("ADDITIONAL STATISTICS")
print("=" * 60)
print(f"Total unique tasks in dataset: {len(all_task_ids)}")
print(f"Tasks that ran at least once: {len(task_averages)}")
print(f"Tasks that never ran: {len(all_task_ids) - len(task_averages)}")


if __name__ == "__main__":
main()
105 changes: 105 additions & 0 deletions evaluation/README_modal.md
@@ -0,0 +1,105 @@
# Modal API for Spreadsheet Evaluation

This Modal app provides an API endpoint for evaluating spreadsheet files against ground truth data from the SpreadsheetBench dataset.

## Deployment

Deploy the Modal app **from the project root directory**:

```bash
modal deploy evaluation/modal_api.py --env benchmark
```

## Verification

After deployment, verify the endpoint is working correctly.

### Run Tests

The test script requires the `--url` parameter with your deployed API endpoint.

```bash
# Extract test data
tar -xzf data/all_data_912.tar.gz -C data_copy all_data_912/spreadsheet/17-35/

# Run full test suite
python evaluation/test_endpoint.py --url "<your-deployed-endpoint-url>"
```

**Expected output for a passing test:**
```
================================================================================
EVALUATION RESULTS
================================================================================
✓ Success: True
✓ Overall Result: PASS
✓ ID: 17-35
✓ Instruction Type: Sheet-Level Manipulation
✓ Test Case Results: [True, True, True]
✓ Soft Restriction: 100.00%
✓ Hard Restriction: 1
```

See `evaluation/test_endpoint.py` for the complete test implementation.

## Usage

### Understanding Test Cases

Each spreadsheet task has **3 independent test cases**:
- Test case 1: `1_{id}_input.xlsx` → your model produces → `1_{id}_output.xlsx`
- Test case 2: `2_{id}_input.xlsx` → your model produces → `2_{id}_output.xlsx`
- Test case 3: `3_{id}_input.xlsx` → your model produces → `3_{id}_output.xlsx`

Each output file is evaluated against its corresponding answer file (`1_{id}_answer.xlsx`, `2_{id}_answer.xlsx`, `3_{id}_answer.xlsx`).
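As a rough sketch (the directory layout under `data/all_data_912/spreadsheet/<id>/` and the helper below are assumptions, not part of the test script), the three test-case files for one task can be enumerated like this:

```python
from pathlib import Path

# Hypothetical helper: list the input/output/answer files for one task.
# Assumes files live under data/all_data_912/spreadsheet/<id>/.
def task_files(task_id: str, root: str = "data/all_data_912/spreadsheet"):
    task_dir = Path(root) / task_id
    for case in (1, 2, 3):
        yield {
            "input": task_dir / f"{case}_{task_id}_input.xlsx",
            "output": task_dir / f"{case}_{task_id}_output.xlsx",  # produced by your model
            "answer": task_dir / f"{case}_{task_id}_answer.xlsx",  # ground truth
        }

for files in task_files("17-35"):
    print(files["input"], "->", files["output"])
```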

### API Endpoint

The API accepts POST requests with the following JSON format:

```json
{
"id": "17-35",
"outputs": {
"0": "base64_encoded_xlsx_for_test_case_1",
"1": "base64_encoded_xlsx_for_test_case_2",
"2": "base64_encoded_xlsx_for_test_case_3"
}
}
```

**Parameters:**
- `id` (required): The spreadsheet ID from the dataset
- `outputs` (required): Dictionary mapping test case index (as string "0", "1", "2") to base64-encoded .xlsx file content
- You can omit test cases if outputs are not available (e.g., only provide `{"0": "...", "2": "..."}`)
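A minimal client sketch (assuming the `requests` package, a placeholder endpoint URL, and output files named as described above) could look like:

```python
import base64
import requests  # assumed HTTP client; any library that can POST JSON works

API_URL = "https://<your-modal-endpoint-url>"  # placeholder: use your deployed endpoint
task_id = "17-35"

outputs = {}
for case_index in range(3):
    # Test-case keys are the strings "0", "1", "2"; file prefixes are 1, 2, 3.
    path = f"{case_index + 1}_{task_id}_output.xlsx"
    try:
        with open(path, "rb") as f:
            outputs[str(case_index)] = base64.b64encode(f.read()).decode("ascii")
    except FileNotFoundError:
        pass  # omitted test cases are counted as failures by the evaluator

response = requests.post(API_URL, json={"id": task_id, "outputs": outputs})
print(response.json())
```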

**Response:**
```json
{
"success": true,
"result": true,
"id": "17-35",
"instruction_type": "Sheet-Level Manipulation",
"test_case_results": [true, true, true],
"soft_restriction": 1.0,
"hard_restriction": 1,
"messages": [
"Test case 1: PASS - ",
"Test case 2: PASS - ",
"Test case 3: PASS - "
]
}
```

**Response Fields:**
- `success`: Whether the API call was successful
- `result`: True if `hard_restriction == 1` (all 3 test cases passed)
- `id`: The spreadsheet ID
- `instruction_type`: The type of instruction from the dataset
- `test_case_results`: Array of results for each test case (`true`/`false`/`null` for not provided)
- `soft_restriction`: Ratio of passing test cases out of 3 (0.0 to 1.0). **Note:** Missing test cases count as failures, matching `evaluation.py` behavior
- `hard_restriction`: 1 if all 3 test cases pass, 0 otherwise
- `messages`: Detailed messages for each test case

**Important:** Following `evaluation.py` logic, missing test case outputs are counted as failures. If you provide only 2 files and both pass, `soft_restriction = 2/3 = 0.67`, not 1.0.
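As a purely illustrative sketch of that scoring rule (the variable names below are not taken from the API):

```python
# Illustrative only: how soft/hard restriction follow from per-case results,
# where None means "output not provided" and is treated as a failure.
test_case_results = [True, None, True]

passed = sum(1 for r in test_case_results if r is True)
soft_restriction = passed / 3               # 2/3 ≈ 0.67 in this example
hard_restriction = 1 if passed == 3 else 0  # 0 here, since one case is missing
print(soft_restriction, hard_restriction)
```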