Merged
12 changes: 8 additions & 4 deletions cookbooks/grader_validation/rewardbench2.py
@@ -307,8 +307,10 @@ async def _evaluate_four_way(
GraderScore: Result with score=1.0 if predicted best answer matches ground truth
"""
# Handle None case for mutable arguments
answers = answers if answers is not None else []
chosen_indices = chosen_indices if chosen_indices is not None else []
if not answers:
answers = []
if not chosen_indices:
chosen_indices = []

# Ensure we have exactly 4 answers
if len(answers) < 4:
@@ -402,8 +404,10 @@ async def _evaluate_ties(
GraderScore: Result with score=1.0 if any top-rated answer is in chosen_indices
"""
# Handle None case for mutable arguments
answers = answers if answers is not None else []
chosen_indices = chosen_indices if chosen_indices is not None else []
if not answers:
answers = []
if not chosen_indices:
chosen_indices = []

correct_indices = set(chosen_indices)

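For context, a small self-contained sketch of the argument-normalization idiom used in these two hunks; the function name and downstream use are illustrative only, not taken from the module:

from typing import List, Optional

def score_answers(answers: Optional[List[str]] = None,
                  chosen_indices: Optional[List[int]] = None) -> int:
    # Normalize missing (None) or empty sequence arguments to fresh lists,
    # so later iteration and set() construction never see None.
    if not answers:
        answers = []
    if not chosen_indices:
        chosen_indices = []
    correct_indices = set(chosen_indices)
    return len(correct_indices)

print(score_answers())                       # 0
print(score_answers(["a", "b"], [0, 1, 1]))  # 2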
2 changes: 1 addition & 1 deletion cookbooks/training_judge_model/bradley-terry/dataset.py
@@ -120,7 +120,7 @@ def _tokenize_messages(self, messages: List[Dict[str, str]]) -> Dict[str, torch.
# Handle sequence length like SFT dataset
if sequence_length < self.max_length:
# Pad sequences
pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id is not None else 0
pad_token_id = self.tokenizer.pad_token_id if self.tokenizer.pad_token_id else 0
padded_input_ids = (
torch.ones(
size=(self.max_length - sequence_length,),
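As a point of reference, a minimal sketch of padding a tokenized sequence up to a fixed length with a given pad token id, in the spirit of this hunk; the function name and the example values are hypothetical:

import torch

def pad_to_length(input_ids: torch.Tensor, max_length: int, pad_token_id: int) -> torch.Tensor:
    # Right-pad a 1-D tensor of token ids to max_length with pad_token_id.
    sequence_length = input_ids.shape[0]
    if sequence_length >= max_length:
        return input_ids[:max_length]
    padding = torch.full(size=(max_length - sequence_length,), fill_value=pad_token_id, dtype=input_ids.dtype)
    return torch.cat([input_ids, padding], dim=0)

print(pad_to_length(torch.tensor([5, 7, 9]), max_length=6, pad_token_id=0))
# tensor([5, 7, 9, 0, 0, 0])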
13 changes: 6 additions & 7 deletions cookbooks/training_judge_model/bradley-terry/trainer.py
@@ -163,7 +163,7 @@ def _build_model_optimizer(self):
verbose=True,
)

if self.config.model.external_lib is not None:
if self.config.model.external_lib:
import importlib

importlib.import_module(self.config.model.external_lib)
@@ -538,11 +538,10 @@ def fit(self):
last_valid_metric = None
latest_train_metric = {}

total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
if self.config.trainer.total_training_steps is not None:
total_training_steps = self.config.trainer.total_training_steps

self.total_training_steps = total_training_steps
if self.config.trainer.total_training_steps:
self.total_training_steps = self.config.trainer.total_training_steps
else:
self.total_training_steps = len(self.train_dataloader) * self.config.trainer.total_epochs
print(f"Total training steps: {self.total_training_steps}")

# Create a single progress bar for all training steps
@@ -690,7 +689,7 @@ def run_bt_training(config):
)

# Ensure pad token exists
if tokenizer.pad_token is None:
if not tokenizer.pad_token:
tokenizer.pad_token = tokenizer.eos_token

# Create datasets
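For reference, a minimal sketch of the total-steps logic in the fit() hunk above, with an explicit override taking precedence over the value derived from the dataloader; the function and parameter names are assumed, not the project's actual config schema:

def resolve_total_training_steps(num_batches_per_epoch: int, total_epochs: int,
                                 configured_steps=None) -> int:
    # Use the configured step count when it is set to a non-zero value,
    # otherwise derive it from the dataloader length and the epoch count.
    if configured_steps:
        return configured_steps
    return num_batches_per_epoch * total_epochs

print(resolve_total_training_steps(250, 3))         # 750
print(resolve_total_training_steps(250, 3, 1000))   # 1000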
31 changes: 16 additions & 15 deletions openjudge/analyzer/statistical/consistency_analyzer.py
@@ -101,21 +101,22 @@ def analyze(
>>> print(f"Consistency: {result.consistency:.2f}")
Consistency: 0.99
"""
# Handle the case where the method is called with the old signature
# i.e., analyze(first_run_results, second_run_results)
first_run_results = grader_results
second_run_results = another_grader_results

# If the parameters were passed positionally as before, dataset will be first_run_results
# and grader_results will be second_run_results
if first_run_results is None and second_run_results is None:
if dataset is not None and grader_results is not None:
first_run_results = dataset
second_run_results = grader_results
else:
# If still not set, use empty lists
first_run_results = []
second_run_results = []
# Need to support the old 2-argument call signature: analyze(first_run_results, second_run_results).
# Determine which argument holds the first-run results and which holds the second-run results.
if grader_results and another_grader_results:
# current call signature
first_run_results = grader_results
second_run_results = another_grader_results
elif dataset and grader_results:
# The first two arguments contain values but the third does not.
# Treat this as a call following the old 2-argument signature.
first_run_results = dataset
second_run_results = grader_results
else:
# Insufficient arguments for the current call signature: only dataset and another_grader_results are set while grader_results is empty,
# or none of dataset, grader_results, another_grader_results is set.
first_run_results = []
second_run_results = []

if not first_run_results or not second_run_results:
logger.warning(
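To illustrate the two call patterns this branch distinguishes, a hedged usage sketch; the keyword names follow the parameters referenced above, while analyzer, first_run, and second_run are placeholders:

# Current signature: pass both runs explicitly by keyword.
result = analyzer.analyze(grader_results=first_run, another_grader_results=second_run)

# Old 2-argument signature: the two runs arrive positionally, landing in
# `dataset` and `grader_results`, which the elif branch above reinterprets
# as the first and second run results.
result = analyzer.analyze(first_run, second_run)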
2 changes: 1 addition & 1 deletion openjudge/generator/iterative_rubric/categorizer.py
@@ -238,7 +238,7 @@ async def categorize_rubrics(
>>> categorized_rubrics, info = await categorizer.categorize_rubrics(rubrics)
"""

if len(rubrics) == 0:
if not rubrics:
logger.error("Input rubrics list is empty")
return [], {
"categorization_successful": False,
2 changes: 1 addition & 1 deletion openjudge/generator/iterative_rubric/generator.py
@@ -265,7 +265,7 @@ async def generate(
grader_kwargs["max_score"] = self.config.max_score

# Add custom template if provided
if hasattr(self.config, "custom_evaluation_prompt") and self.config.custom_evaluation_prompt is not None:
if hasattr(self.config, "custom_evaluation_prompt") and self.config.custom_evaluation_prompt:
grader_kwargs["template"] = self.config.custom_evaluation_prompt

return LLMGrader(**grader_kwargs)
4 changes: 2 additions & 2 deletions openjudge/graders/agent/action/action_alignment.py
@@ -180,15 +180,15 @@ def __init__(
Defaults to DEFAULT_ACTION_ALIGNMENT_TEMPLATE.
language: The language for the evaluation prompt. Defaults to LanguageEnum.EN.
"""
template_arg = template if template else DEFAULT_ACTION_ALIGNMENT_TEMPLATE
Review comment (Collaborator):
template_arg = template or DEFAULT_ACTION_ALIGNMENT_TEMPLATE

Reply (Author):
Done. Applied to other locations too.

super().__init__(
name="action_alignment",
mode=GraderMode.POINTWISE,
description="Evaluate action alignment with plan",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_ACTION_ALIGNMENT_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
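A compact sketch of the default-template fallback these grader changes converge on: resolve the template once and pass the resolved value on, instead of forwarding the raw argument. The class, constant, and template string here are illustrative, and the real graders forward template_arg to a parent __init__ rather than storing it directly:

DEFAULT_TEMPLATE = "Rate the response from 1 to 5: {response}"

class ExampleGrader:
    def __init__(self, template=None):
        # Fall back to the default when no template (or an empty one) is supplied,
        # then use the resolved value everywhere downstream.
        template_arg = template if template else DEFAULT_TEMPLATE
        self.template = template_arg

grader = ExampleGrader()
print(grader.template)  # Rate the response from 1 to 5: {response}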
4 changes: 2 additions & 2 deletions openjudge/graders/agent/memory/memory_detail_preservation.py
@@ -171,15 +171,15 @@ def __init__(
template: Optional[PromptTemplate] = DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
template_arg = template if template else DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE
super().__init__(
name="memory_detail_preservation",
mode=GraderMode.POINTWISE,
description="Evaluate memory detail preservation",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_MEMORY_DETAIL_PRESERVATION_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
openjudge/graders/agent/memory/memory_retrieval_effectiveness.py
@@ -174,15 +174,15 @@ def __init__(
template: Optional[PromptTemplate] = DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
template_arg = template if template else DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE
super().__init__(
name="memory_retrieval_effectiveness",
mode=GraderMode.POINTWISE,
description="Evaluate memory retrieval effectiveness",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_MEMORY_RETRIEVAL_EFFECTIVENESS_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
4 changes: 2 additions & 2 deletions openjudge/graders/agent/plan/plan_feasibility.py
@@ -174,15 +174,15 @@ def __init__(
template: Optional[PromptTemplate] = DEFAULT_PLAN_FEASIBILITY_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
template_arg = template if template else DEFAULT_PLAN_FEASIBILITY_TEMPLATE
super().__init__(
name="plan_feasibility",
mode=GraderMode.POINTWISE,
description="Evaluate plan feasibility",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_PLAN_FEASIBILITY_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
4 changes: 2 additions & 2 deletions openjudge/graders/agent/reflection/reflection_accuracy.py
@@ -171,15 +171,15 @@ def __init__(
template: Optional[PromptTemplate] = DEFAULT_REFLECTION_ACCURACY_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
template_arg = template if template else DEFAULT_REFLECTION_ACCURACY_TEMPLATE
super().__init__(
name="reflection_accuracy",
mode=GraderMode.POINTWISE,
description="Evaluate reflection accuracy",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_REFLECTION_ACCURACY_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
openjudge/graders/agent/reflection/reflection_outcome_understanding.py
@@ -295,15 +295,15 @@ def __init__(
template: Optional[PromptTemplate] = DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
template_arg = template if template else DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE
super().__init__(
name="reflection_outcome_understanding",
mode=GraderMode.POINTWISE,
description="Evaluate reflection outcome understanding",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_REFLECTION_OUTCOME_UNDERSTANDING_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
openjudge/graders/agent/reflection/reflection_progress_awareness.py
@@ -212,15 +212,15 @@ def __init__(
template: Optional[PromptTemplate] = DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
template_arg = template if template else DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE
super().__init__(
name="reflection_progress_awareness",
mode=GraderMode.POINTWISE,
description="Evaluate reflection progress awareness",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_REFLECTION_PROGRESS_AWARENESS_TEMPLATE

def _format_history(self, history: Optional[list] = None) -> str:
"""Format history steps for evaluation.
4 changes: 2 additions & 2 deletions openjudge/graders/agent/tool/tool_call_accuracy.py
@@ -204,15 +204,15 @@ def __init__(
template: Evaluation template. Defaults to DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE.
language: Language for evaluation prompts (default: LanguageEnum.EN).
"""
template_arg = template if template else DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE
super().__init__(
name="tool_call_accuracy",
mode=GraderMode.POINTWISE,
description="Evaluates the accuracy of tool calls made by an agent",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_TOOL_CALL_ACCURACY_TEMPLATE

def _parse_tools_from_response(
self,
4 changes: 2 additions & 2 deletions openjudge/graders/agent/tool/tool_parameter_check.py
@@ -184,15 +184,15 @@ def __init__(
template: Optional[PromptTemplate] = DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
template_arg = template if template else DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE
super().__init__(
name="tool_parameter_check",
mode=GraderMode.POINTWISE,
description="Evaluate tool parameter extraction correctness",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_TOOL_PARAMETER_CHECK_TEMPLATE

async def aevaluate(
self,
4 changes: 2 additions & 2 deletions openjudge/graders/agent/tool/tool_selection.py
@@ -197,15 +197,15 @@ def __init__(
template: Optional[PromptTemplate] = DEFAULT_TOOL_SELECTION_TEMPLATE,
language: LanguageEnum = LanguageEnum.EN,
):
template_arg = template if template else DEFAULT_TOOL_SELECTION_TEMPLATE
super().__init__(
name="tool_selection",
mode=GraderMode.POINTWISE,
description="Evaluate tool selection ",
model=model,
template=template,
template=template_arg,
language=language,
)
self.template = template if template is not None else DEFAULT_TOOL_SELECTION_TEMPLATE

async def aevaluate(
self,
Empty file.
7 changes: 4 additions & 3 deletions openjudge/graders/code/_utils/testing_util.py
@@ -154,6 +154,10 @@ def run_test(in_outs, test=None, timeout=15):
"""
# Disable functionalities that can make destructive changes to the test.
reliability_guard()

if not test:
raise AssertionError("should not happen: missing test code input")

method_name = None
tmp = None
which_type = None
@@ -170,9 +174,6 @@ def run_test(in_outs, test=None, timeout=15):

logger.debug(f"loaded input_output = {datetime.now().time()}")

if test is None:
raise AssertionError("should not happen: test code is none")

results = []
sol = """from string import *
from re import *
4 changes: 2 additions & 2 deletions openjudge/graders/common/hallucination.py
@@ -260,16 +260,16 @@ def __init__(
template: PromptTemplate for evaluation prompts (default: DEFAULT_HALLUCINATION_TEMPLATE)
language: Language for prompts (default: LanguageEnum.EN)
"""
template_arg = template if template else DEFAULT_HALLUCINATION_TEMPLATE
super().__init__(
name="hallucination",
mode=GraderMode.POINTWISE,
description="Evaluate whether response contains hallucinations",
model=model,
template=template,
template=template_arg,
language=language,
)
self.threshold = threshold
self.template = template if template is not None else DEFAULT_HALLUCINATION_TEMPLATE

async def aevaluate(
self,
6 changes: 3 additions & 3 deletions openjudge/graders/multimodal/_internal/criteria_utils.py
@@ -81,7 +81,7 @@ def validate_and_sort_rubrics(
... ]
>>> sorted_rubrics = validate_and_sort_rubrics(rubrics)
"""
if rubrics is None:
if not rubrics:
return None

# Sort rubrics by start of range
@@ -120,7 +120,7 @@ def format_rubrics(rubrics: Optional[List[Rubric]]) -> Optional[str]:
0-3: Poor quality
7-10: High quality
"""
if rubrics is None:
if not rubrics:
return None

return "\n".join(
@@ -177,7 +177,7 @@ def get_score_range(rubric: Optional[List[Rubric]]) -> Tuple[int, int]:
>>> get_score_range(rubrics)
(0, 10)
"""
if rubric is None:
if not rubric:
return (0, 10)

return rubric[0].score_range[0], rubric[-1].score_range[1]
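For illustration, a tiny self-contained analogue of the rubric helpers above, showing how the `if not rubric` guard behaves for both None and an empty list; the dict-based rubric shape is a stand-in, not the library's Rubric class:

from typing import List, Optional, Tuple

def get_score_range(rubric: Optional[List[dict]]) -> Tuple[int, int]:
    # Treat both None and [] as "no rubric provided" and fall back to (0, 10).
    if not rubric:
        return (0, 10)
    # Otherwise span from the first rubric's lower bound to the last rubric's upper bound.
    return rubric[0]["score_range"][0], rubric[-1]["score_range"][1]

print(get_score_range(None))  # (0, 10)
print(get_score_range([]))    # (0, 10)
print(get_score_range([{"score_range": (2, 5)}, {"score_range": (6, 9)}]))  # (2, 9)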
8 changes: 4 additions & 4 deletions openjudge/models/formatter/dashscope_formatter.py
@@ -75,8 +75,8 @@ def _convert_content_to_openai(
Returns:
Content in OpenAI format.
"""
# If content is None, return empty string
if content is None:
# Return empty string if no content input
if not content:
return ""

# If content is a string, return as is
@@ -135,8 +135,8 @@ def _convert_content_to_dashscope(
Returns:
Content in DashScope format.
"""
# If content is None, return empty string
if content is None:
# Return empty string if no content input
if not content:
return ""

# If content is a string, return as is
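A hedged sketch of the content-normalization behaviour in these two hunks: falsy content (None, an empty string, or an empty list) becomes an empty string, plain strings pass through, and structured content would be handled by the conversion branches that follow in the real formatter (kept as a placeholder comment here):

from typing import Any

def convert_content(content: Any) -> Any:
    # Return an empty string when there is no usable content (None, "", []).
    if not content:
        return ""
    # Plain strings pass through unchanged.
    if isinstance(content, str):
        return content
    # Structured content (e.g. a list of message parts) would be converted here;
    # the real formatter walks each part and maps it to the target schema.
    return content

print(repr(convert_content(None)))  # ''
print(repr(convert_content("hi")))  # 'hi'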