12 changes: 8 additions & 4 deletions cookbooks/grader_validation/rewardbench2.py
@@ -307,8 +307,10 @@ async def _evaluate_four_way(
GraderScore: Result with score=1.0 if predicted best answer matches ground truth
"""
# Handle None case for mutable arguments
answers = answers if answers is not None else []
chosen_indices = chosen_indices if chosen_indices is not None else []
if not answers:
answers = []
if not chosen_indices:
chosen_indices = []

# Ensure we have exactly 4 answers
if len(answers) < 4:
@@ -402,8 +404,10 @@ async def _evaluate_ties(
GraderScore: Result with score=1.0 if any top-rated answer is in chosen_indices
"""
# Handle None case for mutable arguments
answers = answers if answers is not None else []
chosen_indices = chosen_indices if chosen_indices is not None else []
if not answers:
answers = []
if not chosen_indices:
chosen_indices = []

correct_indices = set(chosen_indices)

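Note: below is a minimal, standalone sketch of the optional-list guard used in this hunk; the simplified signature and the assertions are illustrative and not part of the PR.

from typing import List, Optional

def evaluate_four_way(answers: Optional[List[str]] = None,
                      chosen_indices: Optional[List[int]] = None) -> int:
    # Mutable defaults are avoided; None or empty inputs become fresh lists,
    # so downstream length and membership checks never see None.
    if not answers:
        answers = []
    if not chosen_indices:
        chosen_indices = []
    return len(answers)

assert evaluate_four_way() == 0
assert evaluate_four_way(["a", "b", "c", "d"], [0]) == 4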
2 changes: 1 addition & 1 deletion cookbooks/training_judge_model/bradley-terry/dataset.py
@@ -120,7 +120,7 @@ def _tokenize_messages(self, messages: List[Dict[str, str]]) -> Dict[str, torch.
# Handle sequence length like SFT dataset
if sequence_length < self.max_length:
# Pad sequences
pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0
pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else 0
padded_input_ids = (
torch.ones(
size=(self.max_length - sequence_length,),
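Note: an illustrative, self-contained padding sketch matching the branch above; the token values and max_length are made up for the example, and the fallback pad id of 0 mirrors the line changed in this hunk.

import torch

max_length = 8
input_ids = torch.tensor([101, 2023, 2003, 102])  # 4 tokens
pad_token_id = 0  # tokenizer.pad_token_id when available, else 0

if input_ids.size(0) < max_length:
    # Right-pad the sequence up to max_length with the pad token.
    padding = torch.full((max_length - input_ids.size(0),), pad_token_id, dtype=input_ids.dtype)
    input_ids = torch.cat([input_ids, padding])

assert input_ids.tolist() == [101, 2023, 2003, 102, 0, 0, 0, 0]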
13 changes: 6 additions & 7 deletions cookbooks/training_judge_model/bradley-terry/trainer.py
@@ -163,7 +163,7 @@ def _build_model_optimizer(self):
verbose=True,
)

if self.config.model.external_lib is not None:
if self.config.model.external_lib:
import importlib

importlib.import_module(self.config.model.external_lib)
@@ -538,11 +538,10 @@ def fit(self):
last_valid_metric = None
latest_train_metric = {}

total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
if self.config.trainer.total_training_steps is not None:
total_training_steps = self.config.trainer.total_training_steps

self.total_training_steps = total_training_steps
if self.config.trainer.total_training_steps:
self.total_training_steps = self.config.trainer.total_training_steps
else:
self.total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
print(f"Total training steps: {self.total_training_steps}")

# Create a single progress bar for all training steps
@@ -690,7 +689,7 @@ def run_bt_training(config):
)

# Ensure pad token exists
if tokenizer.pad_token is None:
if not tokenizer.pad_token:
tokenizer.pad_token = tokenizer.eos_token

# Create datasets
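Note: a small standalone sketch of the step-count resolution in fit() above; resolve_total_steps is a hypothetical helper written only to illustrate the precedence, not a function in the PR.

def resolve_total_steps(configured_steps, num_batches, total_epochs):
    # An explicit (truthy) config value wins; otherwise derive the count
    # from the dataloader length and the number of epochs.
    if configured_steps:
        return configured_steps
    return num_batches * total_epochs

assert resolve_total_steps(None, 100, 3) == 300
assert resolve_total_steps(500, 100, 3) == 500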
33 changes: 18 additions & 15 deletions openjudge/analyzer/statistical/consistency_analyzer.py
@@ -101,21 +101,24 @@ def analyze(
>>> print(f"Consistency: {result.consistency:.2f}")
Consistency: 0.99
"""
# Handle the case where the method is called with the old signature
# i.e., analyze(first_run_results, second_run_results)
first_run_results = grader_results
second_run_results = another_grader_results

# If the parameters were passed positionally as before, dataset will be first_run_results
# and grader_results will be second_run_results
if first_run_results is None and second_run_results is None:
if dataset is not None and grader_results is not None:
first_run_results = dataset
second_run_results = grader_results
else:
# If still not set, use empty lists
first_run_results = []
second_run_results = []
# Need to support the old 2-argument call signature: analyze(first_run_results, second_run_results).
# Need to determine which argument holds the 1st run results and which holds the 2nd run results.
if grader_results and another_grader_results:
# current call signature
first_run_results = grader_results
second_run_results = another_grader_results
elif dataset and grader_results:
# The first two arguments contain values but the 3rd does not.
# Treat this as a call following the old 2-argument signature.
first_run_results = dataset
second_run_results = grader_results
else:
# 1. Insufficient arguments for the current call signature:
#    dataset and another grader result exist,
#    but the 2nd argument (grader_results) has no value.
# Or 2. none of dataset, grader_results, another_grader_results exists.
first_run_results = []
second_run_results = []

if not first_run_results or not second_run_results:
logger.warning(
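Note: a standalone sketch of the argument-disambiguation logic above; resolve_runs is a hypothetical helper and the run payloads are plain lists rather than grader results.

def resolve_runs(dataset=None, grader_results=None, another_grader_results=None):
    if grader_results and another_grader_results:
        # Current signature: both runs arrive via the grader-result parameters.
        return grader_results, another_grader_results
    if dataset and grader_results:
        # Legacy two-argument call: the runs arrived positionally in the first two slots.
        return dataset, grader_results
    # Not enough information either way; fall back to empty runs.
    return [], []

assert resolve_runs(grader_results=[1], another_grader_results=[2]) == ([1], [2])
assert resolve_runs([1], [2]) == ([1], [2])  # old call style
assert resolve_runs() == ([], [])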
2 changes: 1 addition & 1 deletion openjudge/generator/iterative_rubric/categorizer.py
@@ -238,7 +238,7 @@ async def categorize_rubrics(
>>> categorized_rubrics, info = await categorizer.categorize_rubrics(rubrics)
"""

if len(rubrics) == 0:
if not rubrics:
logger.error("Input rubrics list is empty")
return [], {
"categorization_successful": False,
2 changes: 1 addition & 1 deletion openjudge/generator/iterative_rubric/generator.py
@@ -265,7 +265,7 @@ async def generate(
grader_kwargs["max_score"] = self.config.max_score

# Add custom template if provided
if hasattr(self.config, "custom_evaluation_prompt") and self.config.custom_evaluation_prompt is not None:
if hasattr(self.config, "custom_evaluation_prompt") and self.config.custom_evaluation_prompt:
grader_kwargs["template"] = self.config.custom_evaluation_prompt

return LLMGrader(**grader_kwargs)
3 changes: 1 addition & 2 deletions openjudge/graders/agent/action/action_alignment.py
@@ -185,10 +185,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate action alignment with plan",
model=model,
template=template,
template=template or DEFAULT_ACTION_ALIGNMENT_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_ACTION_ALIGNMENT_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
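Note: the same "template or DEFAULT_..." fallback recurs in the grader constructors below; here is a minimal sketch of the idiom with placeholder names.

DEFAULT_TEMPLATE = "Rate the response on a 1-5 scale."

class ExampleGrader:
    def __init__(self, template=None):
        # Any falsy template (None or "") falls back to the default.
        self.template = template or DEFAULT_TEMPLATE

assert ExampleGrader().template == DEFAULT_TEMPLATE
assert ExampleGrader("Custom prompt").template == "Custom prompt"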
3 changes: 1 addition & 2 deletions openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -176,10 +176,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate memory detail preservation",
model=model,
template=template,
template=template or DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
@@ -179,10 +179,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate memory retrieval effectiveness",
model=model,
template=template,
template=template or DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
3 changes: 1 addition & 2 deletions openjudge/graders/agent/plan/plan_feasibility.py
@@ -179,10 +179,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate plan feasibility",
model=model,
template=template,
template=template or DEFAULT_PLAN_FEASIBILITY_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_PLAN_FEASIBILITY_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
3 changes: 1 addition & 2 deletions openjudge/graders/agent/reflection/reflection_accuracy.py
@@ -176,10 +176,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate reflection accuracy",
model=model,
template=template,
template=template or DEFAULT_REFLECTION_ACCURACY_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_REFLECTION_ACCURACY_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
@@ -300,10 +300,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate reflection outcome understanding",
model=model,
template=template,
template=template or DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
@@ -217,10 +217,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate reflection progress awareness",
model=model,
template=template,
template=template or DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
3 changes: 1 addition & 2 deletions openjudge/graders/agent/tool/tool_call_accuracy.py
@@ -209,10 +209,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluates the accuracy of tool calls made by an agent",
model=model,
template=template,
template=template or DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE

def _parse_tools_from_response(
self,
3 changes: 1 addition & 2 deletions openjudge/graders/agent/tool/tool_parameter_check.py
@@ -189,10 +189,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate tool parameter extraction correctness",
model=model,
template=template,
template=template or DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE

async def aevaluate(
self,
3 changes: 1 addition & 2 deletions openjudge/graders/agent/tool/tool_selection.py
@@ -202,10 +202,9 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate tool selection ",
model=model,
template=template,
template=template or DEFAULT_TOOL_SELECTION_TEMPLATE,
language=language,
)
self.template = template if template is not None else DEFAULT_TOOL_SELECTION_TEMPLATE

async def aevaluate(
self,
Empty file.
7 changes: 4 additions & 3 deletions openjudge/graders/code/_utils/testing_util.py
@@ -154,6 +154,10 @@ def run_test(in_outs, test=None, timeout=15):
"""
# Disable functionalities that can make destructive changes to the test.
reliability_guard()

if not test:
raise AssertionError("should not happen: missing test code input")

method_name = None
tmp = None
which_type = None
@@ -170,9 +174,6 @@ def run_test(in_outs, test=None, timeout=15):

logger.debug(f"loaded input_output = {datetime.now().time()}")

if test is None:
raise AssertionError("should not happen: test code is none")

results = []
sol = """from string import *
from re import *
3 changes: 1 addition & 2 deletions openjudge/graders/common/hallucination.py
@@ -265,11 +265,10 @@ def __init__(
mode=GraderMode.POINTWISE,
description="Evaluate whether response contains hallucinations",
model=model,
template=template,
template=template or DEFAULT_HALLUCINATION_TEMPLATE,
language=language,
)
self.threshold = threshold
self.template = template if template is not None else DEFAULT_HALLUCINATION_TEMPLATE

async def aevaluate(
self,
6 changes: 3 additions & 3 deletions openjudge/graders/multimodal/_internal/criteria_utils.py
@@ -81,7 +81,7 @@ def validate_and_sort_rubrics(
... ]
>>> sorted_rubrics = validate_and_sort_rubrics(rubrics)
"""
if rubrics is None:
if not rubrics:
return None

# Sort rubrics by start of range
@@ -120,7 +120,7 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:
0-3: Poor quality
7-10: High quality
"""
if rubrics is None:
if not rubrics:
return None

return "\n".join(
@@ -177,7 +177,7 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
>>> get_score_range(rubrics)
(0, 10)
"""
if rubric is None:
if not rubric:
return (0, 10)

return rubric[0].score_range[0], rubric[-1].score_range[1]
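Note: an illustrative, self-contained version of the empty-rubric fallback above; the Rubric stand-in below carries only a score_range field.

from dataclasses import dataclass
from typing import List, Optional, Tuple

@dataclass
class Rubric:
    score_range: Tuple[int, int]

def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
    if not rubric:  # None or an empty list -> default range
        return (0, 10)
    return rubric[0].score_range[0], rubric[-1].score_range[1]

assert get_score_range(None) == (0, 10)
assert get_score_range([]) == (0, 10)
assert get_score_range([Rubric((0, 3)), Rubric((7, 10))]) == (0, 10)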
8 changes: 4 additions & 4 deletions openjudge/models/formatter/dashscope_formatter.py
@@ -75,8 +75,8 @@ def _convert_content_to_openai(
Returns:
Content in OpenAI format.
"""
# If content is None, return empty string
if content is None:
# Return empty string if no content input
if not content:
return ""

# If content is a string, return as is
@@ -135,8 +135,8 @@ def _convert_content_to_dashscope(
Returns:
Content in DashScope format.
"""
# If content is None, return empty string
if content is None:
# Return empty string if no content input
if not content:
return ""

# If content is a string, return as is
2 changes: 1 addition & 1 deletion openjudge/models/schema/prompt_template.py
@@ -184,7 +184,7 @@ def to_messages(
if isinstance(self.messages, list):
messages = self.messages
elif isinstance(self.messages, dict):
if language is None:
if not language:
language = LanguageEnum.EN
assert language in self.messages
messages = self.messages.get(language, [])
4 changes: 2 additions & 2 deletions openjudge/runner/aggregator/base_aggregator.py
@@ -37,12 +37,12 @@ def __name__(self):
return self.name

@abstractmethod
def __call__(self, results: Dict[str, GraderResult], **kwargs) -> GraderResult:
def __call__(self, grader_results: Dict[str, GraderResult], **kwargs) -> GraderResult:
"""
Aggregate results from multiple graders for a single sample.
Args:
results: Dictionary mapping grader names to GraderResult objects for a single sample
grader_results: Dictionary mapping grader names to GraderResult objects for a single sample
**kwargs: Additional arguments for aggregation
Returns:
18 changes: 9 additions & 9 deletions openjudge/runner/aggregator/weighted_sum_aggregator.py
@@ -28,27 +28,27 @@ def __init__(self, name: str, weights: Dict[str, float] = None):
super().__init__(name)
self.weights = weights or {}

def __call__(self, results: Dict[str, GraderResult], **kwargs) -> GraderResult:
def __call__(self, grader_results: Dict[str, GraderResult], **kwargs) -> GraderResult:
"""
Aggregate results using weighted sum for a single sample.
Aggregate multiple grader results using weighted sum for a single sample.

Args:
results: Dictionary mapping grader names to GraderResult objects for a single sample
grader_results: Dictionary mapping grader names to GraderResult objects for a single sample
**kwargs: Additional arguments (unused)

Returns:
Aggregated result as a GraderResult object
"""
if not results:
if not grader_results:
return GraderError(
name=self.name,
reason="No results to aggregate",
error="No results provided for aggregation",
reason="No grader result to aggregate",
error="No grader result provided for aggregation",
)

# Initialize weights if not provided (equal weights)
if not self.weights:
grader_names = list(results.keys())
grader_names = list(grader_results.keys())
equal_weight = 1.0 / len(grader_names) if grader_names else 0.0
weights = {name: equal_weight for name in grader_names}
else:
@@ -59,8 +59,8 @@ def __call__(self, results: Dict[str, GraderResult], **kwargs) -> GraderResult:
component_scores = {}

# Collect scores from all graders for this sample
for grader_name, result in results.items():
# Only process GraderScore results (skip errors, ranks, etc.)
for grader_name, result in grader_results.items():
# Only process results of GraderScore type (skip errors, ranks, etc.)
if isinstance(result, GraderScore):
weight = weights.get(grader_name, 0.0)
weighted_sum += result.score * weight
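Note: a standalone sketch of the weighted-sum aggregation above, with GraderScore reduced to a plain name-to-score mapping and equal weights as the fallback when none are configured.

def weighted_sum(grader_results, weights=None):
    if not grader_results:
        raise ValueError("No grader result provided for aggregation")
    if not weights:
        # Equal weights when none are configured.
        equal = 1.0 / len(grader_results)
        weights = {name: equal for name in grader_results}
    return sum(score * weights.get(name, 0.0) for name, score in grader_results.items())

assert weighted_sum({"a": 1.0, "b": 0.0}) == 0.5
assert weighted_sum({"a": 1.0, "b": 0.0}, {"a": 0.8, "b": 0.2}) == 0.8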