Skip to content

Commit b59d188

Browse files
authored
Ruff linter (#8)
* ruff linter and 2 unit tests * add ruff
1 parent c753fc5 commit b59d188

File tree

14 files changed

+502
-192
lines changed

14 files changed

+502
-192
lines changed

.github/workflows/tests.yml

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,13 +23,16 @@ jobs:
2323
- name: Install dependencies
2424
run: |
2525
python -m pip install --upgrade pip
26-
pip install autopep8 pytest numpy setuptools>=66 wheel>=0.36 build
26+
pip install ruff pytest numpy setuptools>=66 wheel>=0.36 build
2727
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
2828
if [ -f pyproject.toml ]; then pip install -e .; fi
2929
3030
- name: Run Lint
3131
run: |
32-
python -m autopep8 --diff --exit-code --recursive .
32+
# Check for linting issues
33+
ruff check .
34+
# Check for formatting issues (will fail if code needs formatting)
35+
ruff format --check .
3336
3437
test:
3538
runs-on: ubuntu-latest

bixbench/__init__.py

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,19 @@
1-
from .utils import randomize_choices, parse_response, EvalMode, AgentInput, LLMConfig
21
from .prompts import (
32
MCQ_PROMPT_TEMPLATE_WITH_REFUSAL,
43
MCQ_PROMPT_TEMPLATE_WITHOUT_REFUSAL,
54
OPEN_ENDED_PROMPT_TEMPLATE,
65
)
6+
from .utils import AgentInput, EvalMode, LLMConfig, parse_response, randomize_choices
77
from .zero_shot import ZeroshotBaseline
8-
from .graders import grade_mcq_answer, grade_open_ended_answer, compute_metrics
98

109
__all__ = [
11-
"randomize_choices",
12-
"parse_response",
13-
"EvalMode",
14-
"AgentInput",
15-
"LLMConfig",
16-
"MCQ_PROMPT_TEMPLATE_WITH_REFUSAL",
1710
"MCQ_PROMPT_TEMPLATE_WITHOUT_REFUSAL",
11+
"MCQ_PROMPT_TEMPLATE_WITH_REFUSAL",
1812
"OPEN_ENDED_PROMPT_TEMPLATE",
13+
"AgentInput",
14+
"EvalMode",
15+
"LLMConfig",
1916
"ZeroshotBaseline",
17+
"parse_response",
18+
"randomize_choices",
2019
]

bixbench/generate_traces.py

Lines changed: 6 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -4,19 +4,18 @@
44
import logging
55
import shutil
66
from pathlib import Path
7-
import yaml
87

98
import datasets
9+
import yaml
10+
from aviary.utils import EvalAnswerMode
11+
from fhda import prompts
12+
from fhda.data_analysis_env import DataAnalysisEnv
13+
from fhda.utils import NBLanguage, collect_notebook_stats, load_mcq
1014
from huggingface_hub import hf_hub_download
1115
from ldp.agent import AgentConfig
1216
from ldp.alg.rollout import RolloutManager
1317
from ldp.data_structures import Trajectory
1418

15-
from fhda import prompts
16-
from fhda.data_analysis_env import DataAnalysisEnv
17-
from fhda.utils import NBLanguage, load_mcq, collect_notebook_stats
18-
from aviary.utils import EvalAnswerMode
19-
2019
logger = logging.getLogger(__name__)
2120

2221

@@ -105,7 +104,7 @@ async def load_bixbench(self) -> datasets.Dataset:
105104
return bixbench
106105

107106
def _extract_and_process_files(self, zip_path: Path, extract_dir: Path):
108-
"""Helper method to extract and process zip files"""
107+
"""Helper method to extract and process zip files."""
109108
# Extract the zip file
110109
shutil.unpack_archive(zip_path, extract_dir)
111110

bixbench/graders.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,9 @@
1-
from .prompts import OPEN_ENDED_GRADING_PROMPT
21
import re
2+
33
from aviary.core import Message
44

5+
from .prompts import OPEN_ENDED_GRADING_PROMPT
6+
57

68
def grade_mcq_answer(target, predicted, unsure):
79
predicted = predicted.upper()
@@ -13,10 +15,7 @@ def grade_mcq_answer(target, predicted, unsure):
1315
# Only for MCQ + w/refusal setting. Used to compute precision
1416
refusal = predicted != unsure
1517

16-
if correct:
17-
grade = 1
18-
else:
19-
grade = 0
18+
grade = 1 if correct else 0
2019
return grade, correct, refusal
2120

2221

bixbench/plotting_utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -47,7 +47,7 @@ def majority_vote_accuracy_by_k(
4747
)
4848
plt.legend() # bbox_to_anchor=(1.05, 0), loc='lower left')
4949
plt.grid(True, alpha=0.3)
50-
# todo: avoid hardcoding out paths or make this an optional parameter
50+
# TODO: avoid hardcoding out paths or make this an optional parameter
5151
plt.savefig(f"bixbench_results/majority_vote_accuracy_{name}.png")
5252
plt.show()
5353

@@ -62,8 +62,8 @@ def plot_model_comparison(results, model1, model2):
6262
colors = {model1: "orange", model2: "#b3d9f2"}
6363

6464
# Load baselines from JSON file
65-
# todo: avoid hardcoding out paths or make this an optional parameter
66-
with open("bixbench_results/zero_shot_baselines.json", "r") as f:
65+
# TODO: avoid hardcoding out paths or make this an optional parameter
66+
with open("bixbench_results/zero_shot_baselines.json") as f:
6767
baselines = json.load(f)
6868
# Draw baseline lines
6969
draw_baselines(x, baselines, barWidth)

bixbench/postprocessing.py

Lines changed: 29 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,24 @@
1-
import asyncio
21
import ast
3-
import pandas as pd
4-
import nbformat
2+
import asyncio
53
import json
4+
import operator
65

7-
from fhda.utils import view_notebook
8-
import postprocessing_utils as utils
6+
import nbformat
7+
import pandas as pd
98
import plotting_utils
9+
import postprocessing_utils as utils
10+
from fhda.utils import view_notebook
1011

1112
pd.options.mode.chained_assignment = None
1213

14+
1315
def load_raw_data(path: str):
1416
"""
1517
Load raw data from a CSV file and process specific columns.
16-
18+
1719
Args:
1820
path (str): Path to the CSV file containing raw data
19-
21+
2022
Returns:
2123
pd.DataFrame: Processed DataFrame with converted column types
2224
"""
@@ -37,7 +39,7 @@ def load_raw_data(path: str):
3739
df[col] = df[col].apply(func)
3840

3941
# Convert json notebook to markdown for postprocessing
40-
if "nb" in df.columns and not "nb_md" in df.columns:
42+
if "nb" in df.columns and "nb_md" not in df.columns:
4143
df_md = pd.DataFrame(
4244
df["nb"].apply(lambda x: view_notebook(x.cells, "python")).tolist(),
4345
columns=["md_notebook", "md_images"],
@@ -50,10 +52,10 @@ def load_raw_data(path: str):
5052
async def process_trajectories(df: pd.DataFrame):
5153
"""
5254
Create a gradable dataframe from a raw dataframe of trajectories.
53-
55+
5456
This function processes the raw data, runs evaluation loops, and saves
5557
the results to CSV files for further analysis.
56-
58+
5759
Args:
5860
df (pd.DataFrame): Raw data containing model trajectories
5961
"""
@@ -67,7 +69,7 @@ async def process_trajectories(df: pd.DataFrame):
6769
# Create correct column for open ended questions
6870
eval_df.loc[eval_df.question_format == "open", "correct"] = eval_df.loc[
6971
eval_df.question_format == "open", "llm_answer"
70-
].apply(lambda x: True if x == "1" else False)
72+
].apply(lambda x: x == "1")
7173
# Extract XML from LLM MCQ answers
7274
eval_df.loc[eval_df.question_format == "mcq", "llm_answer"] = eval_df.loc[
7375
eval_df.question_format == "mcq", "llm_answer"
@@ -85,7 +87,7 @@ async def process_trajectories(df: pd.DataFrame):
8587
async def run_majority_vote():
8688
"""
8789
Implement majority voting evaluation across different model configurations.
88-
90+
8991
This function reads evaluation data, performs majority voting analysis for
9092
multiple choice questions, and produces visualization comparing different model
9193
configurations with and without specific features.
@@ -106,7 +108,7 @@ async def run_majority_vote():
106108
grouped_df["llm_answer"] = grouped_df["llm_answer"].fillna("X")
107109
grouped_df = grouped_df.groupby("uuid").agg(list)
108110
grouped_df["correct_letter"] = grouped_df["correct_letter"].apply(
109-
lambda x: x[0]
111+
operator.itemgetter(0)
110112
)
111113
grouped_df = grouped_df.dropna()
112114
k_values, means, stds = utils.run_majority_voting(
@@ -143,7 +145,7 @@ async def run_majority_vote():
143145
async def compare_capsule_mode():
144146
"""
145147
Compare performance between different model architectures.
146-
148+
147149
This function analyzes and visualizes the performance differences between
148150
GPT-4o and Claude models across different question formats.
149151
"""
@@ -175,10 +177,10 @@ async def compare_capsule_mode():
175177
def calculate_results(df):
176178
"""
177179
Calculate means and confidence intervals for each model and format.
178-
180+
179181
Args:
180182
df (pd.DataFrame): DataFrame containing model evaluation results
181-
183+
182184
Returns:
183185
list: List of dictionaries containing statistical results for each model and format
184186
"""
@@ -206,7 +208,7 @@ def calculate_results(df):
206208
async def compare_capsule_mode_with_refusal():
207209
"""
208210
Compare models with refusal mode enabled.
209-
211+
210212
This function loads evaluation data, processes it to compare how different models
211213
perform when the refusal option is available, and visualizes the results.
212214
"""
@@ -219,10 +221,14 @@ async def compare_capsule_mode_with_refusal():
219221

220222
# Filter to include only runs with refusal option enabled
221223
tmp = tmp[tmp.run_name.str.contains("with_refusal")]
222-
224+
223225
tmp["model"] = tmp["run_name"].apply(lambda x: model1 if "4o" in x else model2)
224-
tmp["vision"] = tmp["run_name"].apply(lambda x: "With Vision" if "image" in x and "no_image" not in x else "Without Vision")
225-
226+
tmp["vision"] = tmp["run_name"].apply(
227+
lambda x: (
228+
"With Vision" if "image" in x and "no_image" not in x else "Without Vision"
229+
)
230+
)
231+
226232
# Calculate means and confidence intervals
227233
results = calculate_results_for_refusal(tmp)
228234
print(results)
@@ -234,10 +240,10 @@ async def compare_capsule_mode_with_refusal():
234240
def calculate_results_for_refusal(df):
235241
"""
236242
Calculate means and confidence intervals for refusal mode comparison.
237-
243+
238244
Args:
239245
df (pd.DataFrame): DataFrame containing model evaluation results
240-
246+
241247
Returns:
242248
list: List of dictionaries containing statistical results for each model and vision mode
243249
"""
@@ -268,4 +274,4 @@ def calculate_results_for_refusal(df):
268274
asyncio.run(process_trajectories(data))
269275
asyncio.run(run_majority_vote())
270276
asyncio.run(compare_capsule_mode())
271-
asyncio.run(compare_capsule_mode_with_refusal())
277+
asyncio.run(compare_capsule_mode_with_refusal())

0 commit comments

Comments (0)