3 changes: 2 additions & 1 deletion .gitignore
@@ -8,4 +8,5 @@ data/all_data_912
inference/outputs/*
inference/log/*

outputs
outputs
/data_copy
188 changes: 188 additions & 0 deletions analyze_ssb_outputs.py
@@ -0,0 +1,188 @@
#!/usr/bin/env python3
"""
Analyze SSB outputs across multiple runs and generate CSV with scores per task.
"""

import json
import os
import csv
from typing import Dict, List


def load_json_results(json_path: str) -> Dict[str, float]:
"""Load JSON results and return dict mapping task_id to score."""
with open(json_path, "r") as f:
data = json.load(f)

results = {}
for entry in data:
# Normalize task_id to string (JSON has mixed int/str types)
task_id = str(entry["id"])
# Use result as score (1 for success, 0 for failure)
score = entry["result"]
results[task_id] = score

return results


def get_all_task_ids(spreadsheet_dir: str) -> List[str]:
"""Get all task IDs from the spreadsheet directory."""
task_ids = []
for item in sorted(os.listdir(spreadsheet_dir)):
path = os.path.join(spreadsheet_dir, item)
if os.path.isdir(path):
task_ids.append(item)
return task_ids


def main():
# Paths
ssb_outputs_dir = "ssb_outputs_2"
spreadsheet_dir = "data/all_data_912/spreadsheet"
output_csv = "ssb_scores_per_task.csv"

# Get all JSON files
json_files = sorted(
[
os.path.join(ssb_outputs_dir, f)
for f in os.listdir(ssb_outputs_dir)
if f.endswith(".json")
]
)

print(f"Found {len(json_files)} JSON files:")
for jf in json_files:
print(f" - {jf}")

# Load results from each run
run_results = []
for json_file in json_files:
results = load_json_results(json_file)
run_results.append(results)
print(f"Loaded {len(results)} results from {os.path.basename(json_file)}")

# Get all task IDs
all_task_ids = get_all_task_ids(spreadsheet_dir)
print(f"\nTotal tasks in dataset: {len(all_task_ids)}")

# Create CSV with scores per task
print(f"\nWriting CSV to {output_csv}...")
with open(output_csv, "w", newline="") as csvfile:
fieldnames = ["task_id"] + [f"run_{i + 1}" for i in range(len(json_files))]
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()

for task_id in all_task_ids:
row = {"task_id": task_id}
for i, run_result in enumerate(run_results):
# Use None/empty for tasks that didn't complete
row[f"run_{i + 1}"] = run_result.get(task_id, "")
writer.writerow(row)

print(f"CSV written successfully!")

# Analysis 1: % failures per run (tasks that didn't run/missing from JSON)
print("\n" + "=" * 60)
print("ANALYSIS 1: Task Failure Rate Per Run (Missing from JSON)")
print("=" * 60)
for i, run_result in enumerate(run_results):
total_tasks = len(all_task_ids)
completed_tasks = len(run_result)
failed_tasks = total_tasks - completed_tasks
failure_rate = (failed_tasks / total_tasks) * 100
print(
f"Run {i + 1}: {failed_tasks}/{total_tasks} tasks didn't run ({failure_rate:.2f}%)"
)

# Average success rate per run (for tasks that ran)
print("\n" + "=" * 60)
print("ANALYSIS 1.5: Average Success Rate Per Run (For Completed Tasks)")
print("=" * 60)
for i, run_result in enumerate(run_results):
if run_result:
avg_success = sum(run_result.values()) / len(run_result)
print(f"Run {i + 1}: {avg_success:.4f} ({len(run_result)} tasks completed)")
else:
print(f"Run {i + 1}: No tasks completed")

    # Analysis 2: Tasks that failed (missing) across all runs
print("\n" + "=" * 60)
print("ANALYSIS 2: Tasks That Failed (Missing) Across All Runs")
print("=" * 60)
failed_all_runs = []
for task_id in all_task_ids:
# Check if task is missing from all runs
missing_in_all = all(task_id not in run_result for run_result in run_results)
if missing_in_all:
failed_all_runs.append(task_id)

print(
f"Total tasks that failed in all {len(json_files)} runs: {len(failed_all_runs)}"
)
if failed_all_runs:
print("\nTasks:")
for task_id in failed_all_runs[:20]: # Show first 20
print(f" - {task_id}")
if len(failed_all_runs) > 20:
print(f" ... and {len(failed_all_runs) - 20} more")

# Analysis 3: Average of average task scores (excluding tasks that didn't run)
print("\n" + "=" * 60)
print("ANALYSIS 3: Average Task Scores (Excluding Missing Tasks)")
print("=" * 60)

task_averages = []
task_details = []
for task_id in all_task_ids:
scores = []
for run_result in run_results:
if task_id in run_result:
scores.append(run_result[task_id])

if scores: # Only compute average if task ran at least once
avg_score = sum(scores) / len(scores)
task_averages.append(avg_score)
task_details.append(
{
"task_id": task_id,
"scores": scores,
"avg": avg_score,
"num_runs": len(scores),
}
)

if task_averages:
overall_average = sum(task_averages) / len(task_averages)
print(f"Number of tasks that ran at least once: {len(task_averages)}")
print(f"Average of average task scores: {overall_average:.4f}")

# Show some examples to verify
print(f"\nSample of task averages (first 10):")
for detail in task_details[:10]:
print(
f" {detail['task_id']}: scores={detail['scores']}, avg={detail['avg']:.2f}, runs={detail['num_runs']}"
)

# Count how many tasks have perfect scores
perfect_tasks = sum(1 for avg in task_averages if avg == 1.0)
zero_tasks = sum(1 for avg in task_averages if avg == 0.0)
print(f"\nTasks with perfect avg (1.0): {perfect_tasks}")
print(f"Tasks with zero avg (0.0): {zero_tasks}")
print(
f"Tasks with mixed results: {len(task_averages) - perfect_tasks - zero_tasks}"
)
else:
print("No tasks ran in any run!")

# Additional stats
print("\n" + "=" * 60)
print("ADDITIONAL STATISTICS")
print("=" * 60)
print(f"Total unique tasks in dataset: {len(all_task_ids)}")
print(f"Tasks that ran at least once: {len(task_averages)}")
print(f"Tasks that never ran: {len(all_task_ids) - len(task_averages)}")


if __name__ == "__main__":
main()
105 changes: 105 additions & 0 deletions evaluation/README_modal.md
@@ -0,0 +1,105 @@
# Modal API for Spreadsheet Evaluation

This Modal app provides an API endpoint for evaluating spreadsheet files against ground truth data from the SpreadsheetBench dataset.

## Deployment

Deploy the Modal app **from the project root directory**:

```bash
modal deploy evaluation/modal_api.py --env benchmark
```

## Verification

After deployment, verify the endpoint is working correctly.

### Run Tests

The test script requires the `--url` parameter with your deployed API endpoint.

```bash
# Extract test data
tar -xzf data/all_data_912.tar.gz -C data_copy all_data_912/spreadsheet/17-35/

# Run full test suite
python evaluation/test_endpoint.py --url "<your-deployed-endpoint-url>"
```

**Expected output for a passing test:**
```
================================================================================
EVALUATION RESULTS
================================================================================
✓ Success: True
✓ Overall Result: PASS
✓ ID: 17-35
✓ Instruction Type: Sheet-Level Manipulation
✓ Test Case Results: [True, True, True]
✓ Soft Restriction: 100.00%
✓ Hard Restriction: 1
```

See `evaluation/test_endpoint.py` for the complete test implementation.

## Usage

### Understanding Test Cases

Each spreadsheet task has **3 independent test cases**:
- Test case 1: `1_{id}_input.xlsx` → your model produces → `1_{id}_output.xlsx`
- Test case 2: `2_{id}_input.xlsx` → your model produces → `2_{id}_output.xlsx`
- Test case 3: `3_{id}_input.xlsx` → your model produces → `3_{id}_output.xlsx`

Each output file is evaluated against its corresponding answer file (`1_{id}_answer.xlsx`, `2_{id}_answer.xlsx`, `3_{id}_answer.xlsx`).
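As a rough sketch (the directory layout under `data/all_data_912/spreadsheet/<id>/` and the helper below are assumptions, not part of the test script), the three test-case files for one task can be enumerated like this:

```python
from pathlib import Path

# Hypothetical helper: list the input/output/answer files for one task.
# Assumes files live under data/all_data_912/spreadsheet/<id>/.
def task_files(task_id: str, root: str = "data/all_data_912/spreadsheet"):
    task_dir = Path(root) / task_id
    for case in (1, 2, 3):
        yield {
            "input": task_dir / f"{case}_{task_id}_input.xlsx",
            "output": task_dir / f"{case}_{task_id}_output.xlsx",  # produced by your model
            "answer": task_dir / f"{case}_{task_id}_answer.xlsx",  # ground truth
        }

for files in task_files("17-35"):
    print(files["input"], "->", files["output"])
```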

### API Endpoint

The API accepts POST requests with the following JSON format:

```json
{
"id": "17-35",
"outputs": {
"0": "base64_encoded_xlsx_for_test_case_1",
"1": "base64_encoded_xlsx_for_test_case_2",
"2": "base64_encoded_xlsx_for_test_case_3"
}
}
```

**Parameters:**
- `id` (required): The spreadsheet ID from the dataset
- `outputs` (required): Dictionary mapping test case index (as string "0", "1", "2") to base64-encoded .xlsx file content
- You can omit test cases if outputs are not available (e.g., only provide `{"0": "...", "2": "..."}`)
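A minimal client sketch (assuming the `requests` package, a placeholder endpoint URL, and output files named as described above) could look like:

```python
import base64
import requests  # assumed HTTP client; any library that can POST JSON works

API_URL = "https://<your-modal-endpoint-url>"  # placeholder: use your deployed endpoint
task_id = "17-35"

outputs = {}
for case_index in range(3):
    # Test-case keys are the strings "0", "1", "2"; file prefixes are 1, 2, 3.
    path = f"{case_index + 1}_{task_id}_output.xlsx"
    try:
        with open(path, "rb") as f:
            outputs[str(case_index)] = base64.b64encode(f.read()).decode("ascii")
    except FileNotFoundError:
        pass  # omitted test cases are counted as failures by the evaluator

response = requests.post(API_URL, json={"id": task_id, "outputs": outputs})
print(response.json())
```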

**Response:**
```json
{
"success": true,
"result": true,
"id": "17-35",
"instruction_type": "Sheet-Level Manipulation",
"test_case_results": [true, true, true],
"soft_restriction": 1.0,
"hard_restriction": 1,
"messages": [
"Test case 1: PASS - ",
"Test case 2: PASS - ",
"Test case 3: PASS - "
]
}
```

**Response Fields:**
- `success`: Whether the API call was successful
- `result`: True if `hard_restriction == 1` (all 3 test cases passed)
- `id`: The spreadsheet ID
- `instruction_type`: The type of instruction from the dataset
- `test_case_results`: Array of results for each test case (`true`/`false`/`null` for not provided)
- `soft_restriction`: Ratio of passing test cases out of 3 (0.0 to 1.0). **Note:** Missing test cases count as failures, matching `evaluation.py` behavior
- `hard_restriction`: 1 if all 3 test cases pass, 0 otherwise
- `messages`: Detailed messages for each test case

**Important:** Following `evaluation.py` logic, missing test case outputs are counted as failures. If you provide only 2 files and both pass, `soft_restriction = 2/3 = 0.67`, not 1.0.
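As a purely illustrative sketch of that scoring rule (the variable names below are not taken from the API):

```python
# Illustrative only: how soft/hard restriction follow from per-case results,
# where None means "output not provided" and is treated as a failure.
test_case_results = [True, None, True]

passed = sum(1 for r in test_case_results if r is True)
soft_restriction = passed / 3               # 2/3 ≈ 0.67 in this example
hard_restriction = 1 if passed == 3 else 0  # 0 here, since one case is missing
print(soft_restriction, hard_restriction)
```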