diff --git a/.gitignore b/.gitignore index c1fb058f0..6e0497a58 100644 --- a/.gitignore +++ b/.gitignore @@ -17,6 +17,7 @@ poetry.lock /site uv*/ uv.lock +local/ # Build and Distribution Files /dist diff --git a/openjudge/evaluation_strategy/voting_evaluation_strategy.py b/openjudge/evaluation_strategy/voting_evaluation_strategy.py index b3f76b3f0..0dfc19a87 100644 --- a/openjudge/evaluation_strategy/voting_evaluation_strategy.py +++ b/openjudge/evaluation_strategy/voting_evaluation_strategy.py @@ -69,7 +69,7 @@ async def execute(self, call_fn: Callable[..., Awaitable[Any]], **kwargs: Any) - values = [result.score for result in results if hasattr(result, "score")] if len(values) == 0: raise ValueError( - "VotingEvaluationStrategy only supports GraderScore." + "VotingEvaluationStrategy only supports GraderScore. " "No results were returned from the evaluation correctly." ) diff --git a/openjudge/graders/base_grader.py b/openjudge/graders/base_grader.py index 2159b44d3..c5bd6326c 100644 --- a/openjudge/graders/base_grader.py +++ b/openjudge/graders/base_grader.py @@ -160,9 +160,9 @@ async def aevaluate(self, executor: BaseResourceExecutor | None = None, **kwargs async def managed_fn(**runtime_kwargs): # Submit to executor for execution # pylint: disable=protected-access - # Create a shallow copy of the grader to prevent top-level state modification. + # Create a deep copy of the grader to prevent top-level state modification. if self.strategy: - runtime_self = self.copy() + runtime_self = copy.deepcopy(self) else: runtime_self = self @@ -273,46 +273,3 @@ def to_dict(self) -> dict: "description": self.description, "kwargs": self.kwargs, } - - def copy(self): - """Create a copy of this grader for evaluation to prevent state sharing between samples. - - This method is called by the runner to create an isolated instance of the grader - for each evaluation to prevent state pollution. 
By default, it attempts to create - a new instance with the same parameters, but subclasses can override this to - provide more specific behavior, especially when dealing with non-serializable - objects like model connections. - - Returns: - BaseGrader: A new instance of the grader with the same configuration - """ - - # # Get the class of this grader - # grader_class = self.__class__ - - # # Get constructor parameters by inspecting the grader's __init__ signature - # sig = inspect.signature(grader_class.__init__) - # init_params = {} - - # for param_name in sig.parameters: - # if param_name in ("self", "args", "kwargs"): # Skip special params - # continue - # if hasattr(self, param_name) or param_name in self.__dict__: - # # Get value from instance, defaulting to the parameter's default if available - # param_default = sig.parameters[param_name].default - # if param_default is not inspect.Parameter.empty: - # init_params[param_name] = getattr(self, param_name, param_default) - # else: - # init_params[param_name] = self.__dict__.get(param_name, getattr(self, param_name, None)) - - # # Create new instance with preserved parameters - # copied_grader = grader_class(**init_params) - - # # Copy over any remaining attributes that weren't part of __init__ - # for attr_name, attr_value in self.__dict__.items(): - # if attr_name not in init_params and not attr_name.startswith("_"): - # setattr(copied_grader, attr_name, attr_value) - - # return copied_grader - - return copy.copy(self) diff --git a/openjudge/models/base_chat_model.py b/openjudge/models/base_chat_model.py index 70ad62f55..8d14d8abd 100644 --- a/openjudge/models/base_chat_model.py +++ b/openjudge/models/base_chat_model.py @@ -5,6 +5,7 @@ interface for different LLM services such as OpenAI, DashScope, etc. """ +import copy from abc import ABC, abstractmethod from typing import Any, AsyncGenerator @@ -144,3 +145,33 @@ def _validate_tool_choice( raise ValueError( f"Invalid tool_choice '{tool_choice}'. 
def __deepcopy__(self, memo: dict) -> "BaseChatModel":
    """Return a deep copy of the chat model that shares the original ``client``.

    Every instance attribute is deep-copied except ``client``, which is
    assigned to the copy by reference so the existing client object is reused.

    Attributes are written straight into the copy's ``__dict__`` rather than
    through ``setattr``. The copy is created with ``__new__`` and never runs
    ``__init__``, so a subclass ``__setattr__`` override or property setter
    could otherwise fire on a half-built object.

    Args:
        memo: A dictionary used by the `copy` module to manage object references
            and prevent infinite recursion.

    Returns:
        BaseChatModel: A new instance of the same class as ``self``.

    Example:
        >>> class MyChatModel(BaseChatModel):
        ...     async def achat(self, *args, **kwargs):
        ...         pass
        >>> model = MyChatModel(model="test", stream=False)
        >>> model_copy = copy.deepcopy(model)
    """
    # Build an empty instance; __init__ is deliberately not invoked.
    cls = self.__class__
    new_obj = cls.__new__(cls)
    # Register the copy before descending so attributes that refer back to
    # ``self`` resolve to ``new_obj`` instead of recursing.
    memo[id(self)] = new_obj

    state = new_obj.__dict__
    for name, value in self.__dict__.items():
        if name == "client":
            # Share the client object with the original instead of copying it.
            state[name] = value
        else:
            state[name] = copy.deepcopy(value, memo)

    return new_obj
a/tests/evaluation_strategy/test_voting_evaluation_strategy.py b/tests/evaluation_strategy/test_voting_evaluation_strategy.py index b2a7f1a22..c9a83622a 100644 --- a/tests/evaluation_strategy/test_voting_evaluation_strategy.py +++ b/tests/evaluation_strategy/test_voting_evaluation_strategy.py @@ -108,7 +108,7 @@ async def mock_call_fn(): with pytest.raises( ValueError, - match="VotingEvaluationStrategy only supports GraderScore." + match="VotingEvaluationStrategy only supports GraderScore. " "No results were returned from the evaluation correctly.", ): await strategy.execute(mock_call_fn)