Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ poetry.lock
/site
uv*/
uv.lock
local/

# Build and Distribution Files
/dist
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,7 @@ async def execute(self, call_fn: Callable[..., Awaitable[Any]], **kwargs: Any) -
values = [result.score for result in results if hasattr(result, "score")]
if len(values) == 0:
raise ValueError(
"VotingEvaluationStrategy only supports GraderScore."
"VotingEvaluationStrategy only supports GraderScore. "
"No results were returned from the evaluation correctly."
)

Expand Down
47 changes: 2 additions & 45 deletions openjudge/graders/base_grader.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,9 +160,9 @@ async def aevaluate(self, executor: BaseResourceExecutor | None = None, **kwargs
async def managed_fn(**runtime_kwargs):
# Submit to executor for execution
# pylint: disable=protected-access
# Create a shallow copy of the grader to prevent top-level state modification.
# Create a deep copy of the grader to prevent top-level state modification.
if self.strategy:
runtime_self = self.copy()
runtime_self = copy.deepcopy(self)
else:
runtime_self = self

Expand Down Expand Up @@ -273,46 +273,3 @@ def to_dict(self) -> dict:
"description": self.description,
"kwargs": self.kwargs,
}

def copy(self):
"""Create a copy of this grader for evaluation to prevent state sharing between samples.

This method is called by the runner to create an isolated instance of the grader
for each evaluation to prevent state pollution. By default, it attempts to create
a new instance with the same parameters, but subclasses can override this to
provide more specific behavior, especially when dealing with non-serializable
objects like model connections.

Returns:
BaseGrader: A new instance of the grader with the same configuration
"""

# # Get the class of this grader
# grader_class = self.__class__

# # Get constructor parameters by inspecting the grader's __init__ signature
# sig = inspect.signature(grader_class.__init__)
# init_params = {}

# for param_name in sig.parameters:
# if param_name in ("self", "args", "kwargs"): # Skip special params
# continue
# if hasattr(self, param_name) or param_name in self.__dict__:
# # Get value from instance, defaulting to the parameter's default if available
# param_default = sig.parameters[param_name].default
# if param_default is not inspect.Parameter.empty:
# init_params[param_name] = getattr(self, param_name, param_default)
# else:
# init_params[param_name] = self.__dict__.get(param_name, getattr(self, param_name, None))

# # Create new instance with preserved parameters
# copied_grader = grader_class(**init_params)

# # Copy over any remaining attributes that weren't part of __init__
# for attr_name, attr_value in self.__dict__.items():
# if attr_name not in init_params and not attr_name.startswith("_"):
# setattr(copied_grader, attr_name, attr_value)

# return copied_grader

return copy.copy(self)
31 changes: 31 additions & 0 deletions openjudge/models/base_chat_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
interface for different LLM services such as OpenAI, DashScope, etc.
"""

import copy
from abc import ABC, abstractmethod
from typing import Any, AsyncGenerator

Expand Down Expand Up @@ -144,3 +145,33 @@ def _validate_tool_choice(
raise ValueError(
f"Invalid tool_choice '{tool_choice}'. " f"Available options: {', '.join(all_options)}",
)

def __deepcopy__(self, memo: dict) -> "BaseChatModel":
"""Deepcopy the chat model.
This method is used to create a deep copy of the chat model.
Args:
memo: A dictionary used by the `copy` module to manage object references
and prevent infinite recursion.
Returns:
BaseChatModel: A deep copy of the chat model.
Example:
>>> class MyChatModel(BaseChatModel):
... async def achat(self, *args, **kwargs):
... pass
>>> model = MyChatModel(model="test", stream=False)
>>> model_copy = copy.deepcopy(model)
"""
# 1. Create a new instance without initializing the client
cls = self.__class__
new_obj = cls.__new__(cls)
memo[id(self)] = new_obj

# 2. Deep copy other regular attributes
for k, v in self.__dict__.items():
if k == "client":
# Inherit the async client for reuse
setattr(new_obj, k, v)
else:
setattr(new_obj, k, copy.deepcopy(v, memo))

return new_obj
3 changes: 2 additions & 1 deletion openjudge/runner/grading_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
"""

import asyncio
import copy
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Tuple, Union

Expand Down Expand Up @@ -238,7 +239,7 @@ async def _arun(
try:
data = parse_data_with_mapper(data, mapper)
# Create an isolated grader instance for this evaluation to prevent state sharing
isolated_grader = grader.copy()
isolated_grader = copy.deepcopy(grader)

# The grader itself handles the mapping internally
return await isolated_grader.aevaluate(executor=executor, **data)
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ dependencies = [
"math-verify>=0.7.0,<0.8.0",
"tqdm>=4.66.0,<5.0.0",
"fire",
"numpy>=1.22.0,<2.0.0",
"numpy>=1.22.0",
"dashscope>=1.19.0",
"tiktoken>=0.7.0",
"nltk>=3.8.1",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ async def mock_call_fn():

with pytest.raises(
ValueError,
match="VotingEvaluationStrategy only supports GraderScore."
match="VotingEvaluationStrategy only supports GraderScore. "
"No results were returned from the evaluation correctly.",
):
await strategy.execute(mock_call_fn)