Future-House
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
Lines changed: 5 additions & 2 deletions
diff --git a/bixbench/__init__.py b/bixbench/__init__.py
Lines changed: 7 additions & 8 deletions
diff --git a/bixbench/generate_traces.py b/bixbench/generate_traces.py
Lines changed: 6 additions & 7 deletions
diff --git a/bixbench/graders.py b/bixbench/graders.py
Lines changed: 4 additions & 5 deletions
diff --git a/bixbench/plotting_utils.py b/bixbench/plotting_utils.py
Lines changed: 3 additions & 3 deletions
diff --git a/bixbench/postprocessing.py b/bixbench/postprocessing.py
Lines changed: 29 additions & 23 deletions
@@ -23,13 +23,18 @@ jobs:
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
-        pip install autopep8 pytest numpy setuptools>=66 wheel>=0.36 build
+        # Quote the version specifiers: unquoted, the shell parses ">=66" and
+        # ">=0.36" as output redirections, silently dropping the constraints.
+        pip install ruff pytest numpy "setuptools>=66" "wheel>=0.36" build
         if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
         if [ -f pyproject.toml ]; then pip install -e .; fi
     
     - name: Run Lint
       run: |
-        python -m autopep8 --diff --exit-code --recursive .
+        # Check for linting issues
+        ruff check .
+        # Check for formatting issues (will fail if code needs formatting)
+        ruff format --check .
 
   test:
     runs-on: ubuntu-latest
 
@@ -1,20 +1,19 @@
-from .utils import randomize_choices, parse_response, EvalMode, AgentInput, LLMConfig
 from .prompts import (
     MCQ_PROMPT_TEMPLATE_WITH_REFUSAL,
     MCQ_PROMPT_TEMPLATE_WITHOUT_REFUSAL,
     OPEN_ENDED_PROMPT_TEMPLATE,
 )
+from .utils import AgentInput, EvalMode, LLMConfig, parse_response, randomize_choices
 from .zero_shot import ZeroshotBaseline
-from .graders import grade_mcq_answer, grade_open_ended_answer, compute_metrics
 
 __all__ = [
-    "randomize_choices",
-    "parse_response",
-    "EvalMode",
-    "AgentInput",
-    "LLMConfig",
-    "MCQ_PROMPT_TEMPLATE_WITH_REFUSAL",
     "MCQ_PROMPT_TEMPLATE_WITHOUT_REFUSAL",
+    "MCQ_PROMPT_TEMPLATE_WITH_REFUSAL",
     "OPEN_ENDED_PROMPT_TEMPLATE",
+    "AgentInput",
+    "EvalMode",
+    "LLMConfig",
     "ZeroshotBaseline",
+    "parse_response",
+    "randomize_choices",
 ]
@@ -4,19 +4,18 @@
 import logging
 import shutil
 from pathlib import Path
-import yaml
 
 import datasets
+import yaml
+from aviary.utils import EvalAnswerMode
+from fhda import prompts
+from fhda.data_analysis_env import DataAnalysisEnv
+from fhda.utils import NBLanguage, collect_notebook_stats, load_mcq
 from huggingface_hub import hf_hub_download
 from ldp.agent import AgentConfig
 from ldp.alg.rollout import RolloutManager
 from ldp.data_structures import Trajectory
 
-from fhda import prompts
-from fhda.data_analysis_env import DataAnalysisEnv
-from fhda.utils import NBLanguage, load_mcq, collect_notebook_stats
-from aviary.utils import EvalAnswerMode
-
 logger = logging.getLogger(__name__)
 
 
@@ -105,7 +104,7 @@ async def load_bixbench(self) -> datasets.Dataset:
         return bixbench
 
     def _extract_and_process_files(self, zip_path: Path, extract_dir: Path):
-        """Helper method to extract and process zip files"""
+        """Helper method to extract and process zip files."""
         # Extract the zip file
         shutil.unpack_archive(zip_path, extract_dir)
 
 
@@ -1,7 +1,9 @@
-from .prompts import OPEN_ENDED_GRADING_PROMPT
 import re
+
 from aviary.core import Message
 
+from .prompts import OPEN_ENDED_GRADING_PROMPT
+
 
 def grade_mcq_answer(target, predicted, unsure):
     predicted = predicted.upper()
@@ -13,10 +15,7 @@ def grade_mcq_answer(target, predicted, unsure):
     # Only for the MCQ + w/refusal setting. Used to compute precision
     refusal = predicted != unsure
 
-    if correct:
-        grade = 1
-    else:
-        grade = 0
+    grade = 1 if correct else 0
     return grade, correct, refusal
 
 
 
@@ -47,7 +47,7 @@ def majority_vote_accuracy_by_k(
         )
     plt.legend()  # bbox_to_anchor=(1.05, 0), loc='lower left')
     plt.grid(True, alpha=0.3)
-    # todo: avoid hardcoding out paths or make this an optional parameter
+    # TODO: avoid hardcoding out paths or make this an optional parameter
     plt.savefig(f"bixbench_results/majority_vote_accuracy_{name}.png")
     plt.show()
 
@@ -62,8 +62,8 @@ def plot_model_comparison(results, model1, model2):
     colors = {model1: "orange", model2: "#b3d9f2"}
 
     # Load baselines from JSON file
-    # todo: avoid hardcoding out paths or make this an optional parameter
-    with open("bixbench_results/zero_shot_baselines.json", "r") as f:
+    # TODO: avoid hardcoding out paths or make this an optional parameter
+    with open("bixbench_results/zero_shot_baselines.json") as f:
         baselines = json.load(f)
     # Draw baseline lines
     draw_baselines(x, baselines, barWidth)
 
@@ -1,22 +1,24 @@
-import asyncio
 import ast
-import pandas as pd
-import nbformat
+import asyncio
 import json
+import operator
 
-from fhda.utils import view_notebook
-import postprocessing_utils as utils
+import nbformat
+import pandas as pd
 import plotting_utils
+import postprocessing_utils as utils
+from fhda.utils import view_notebook
 
 pd.options.mode.chained_assignment = None
 
+
 def load_raw_data(path: str):
     """
     Load raw data from a CSV file and process specific columns.
-    
+
     Args:
         path (str): Path to the CSV file containing raw data
-        
+
     Returns:
         pd.DataFrame: Processed DataFrame with converted column types
     """
@@ -37,7 +39,7 @@ def load_raw_data(path: str):
             df[col] = df[col].apply(func)
 
     # Convert json notebook to markdown for postprocessing
-    if "nb" in df.columns and not "nb_md" in df.columns:
+    if "nb" in df.columns and "nb_md" not in df.columns:
         df_md = pd.DataFrame(
             df["nb"].apply(lambda x: view_notebook(x.cells, "python")).tolist(),
             columns=["md_notebook", "md_images"],
@@ -50,10 +52,10 @@ def load_raw_data(path: str):
 async def process_trajectories(df: pd.DataFrame):
     """
     Create a gradable dataframe from a raw dataframe of trajectories.
-    
+
     This function processes the raw data, runs evaluation loops, and saves
     the results to CSV files for further analysis.
-    
+
     Args:
         df (pd.DataFrame): Raw data containing model trajectories
     """
@@ -67,7 +69,7 @@ async def process_trajectories(df: pd.DataFrame):
     # Create correct column for open ended questions
     eval_df.loc[eval_df.question_format == "open", "correct"] = eval_df.loc[
         eval_df.question_format == "open", "llm_answer"
-    ].apply(lambda x: True if x == "1" else False)
+    ].apply(lambda x: x == "1")
     # Extract XML from LLM MCQ answers
     eval_df.loc[eval_df.question_format == "mcq", "llm_answer"] = eval_df.loc[
         eval_df.question_format == "mcq", "llm_answer"
@@ -85,7 +87,7 @@ async def process_trajectories(df: pd.DataFrame):
 async def run_majority_vote():
     """
     Implement majority voting evaluation across different model configurations.
-    
+
     This function reads evaluation data, performs majority voting analysis for
     multiple choice questions, and produces visualization comparing different model
     configurations with and without specific features.
@@ -106,7 +108,7 @@ async def run_majority_vote():
         grouped_df["llm_answer"] = grouped_df["llm_answer"].fillna("X")
         grouped_df = grouped_df.groupby("uuid").agg(list)
         grouped_df["correct_letter"] = grouped_df["correct_letter"].apply(
-            lambda x: x[0]
+            operator.itemgetter(0)
         )
         grouped_df = grouped_df.dropna()
         k_values, means, stds = utils.run_majority_voting(
@@ -143,7 +145,7 @@ async def run_majority_vote():
 async def compare_capsule_mode():
     """
     Compare performance between different model architectures.
-    
+
     This function analyzes and visualizes the performance differences between
     GPT-4o and Claude models across different question formats.
     """
@@ -175,10 +177,10 @@ async def compare_capsule_mode():
 def calculate_results(df):
     """
     Calculate means and confidence intervals for each model and format.
-    
+
     Args:
         df (pd.DataFrame): DataFrame containing model evaluation results
-        
+
     Returns:
         list: List of dictionaries containing statistical results for each model and format
     """
@@ -206,7 +208,7 @@ def calculate_results(df):
 async def compare_capsule_mode_with_refusal():
     """
     Compare models with refusal mode enabled.
-    
+
     This function loads evaluation data, processes it to compare how different models
     perform when the refusal option is available, and visualizes the results.
     """
@@ -219,10 +221,14 @@ async def compare_capsule_mode_with_refusal():
 
     # Filter to include only runs with refusal option enabled
     tmp = tmp[tmp.run_name.str.contains("with_refusal")]
-    
+
     tmp["model"] = tmp["run_name"].apply(lambda x: model1 if "4o" in x else model2)
-    tmp["vision"] = tmp["run_name"].apply(lambda x: "With Vision" if "image" in x and "no_image" not in x else "Without Vision")
-    
+    tmp["vision"] = tmp["run_name"].apply(
+        lambda x: (
+            "With Vision" if "image" in x and "no_image" not in x else "Without Vision"
+        )
+    )
+
     # Calculate means and confidence intervals
     results = calculate_results_for_refusal(tmp)
     print(results)
@@ -234,10 +240,10 @@ async def compare_capsule_mode_with_refusal():
 def calculate_results_for_refusal(df):
     """
     Calculate means and confidence intervals for refusal mode comparison.
-    
+
     Args:
         df (pd.DataFrame): DataFrame containing model evaluation results
-        
+
     Returns:
         list: List of dictionaries containing statistical results for each model and vision mode
     """
@@ -268,4 +274,4 @@ def calculate_results_for_refusal(df):
     asyncio.run(process_trajectories(data))
     asyncio.run(run_majority_vote())
     asyncio.run(compare_capsule_mode())
-    asyncio.run(compare_capsule_mode_with_refusal())
+    asyncio.run(compare_capsule_mode_with_refusal())