diff --git a/.gitignore b/.gitignore
index c1fb058f0..6e0497a58 100644
--- a/.gitignore
+++ b/.gitignore
@@ -17,6 +17,7 @@ poetry.lock
 /site
 uv*/
 uv.lock
+local/
 
 # Build and Distribution Files
 /dist
diff --git a/openjudge/evaluation_strategy/voting_evaluation_strategy.py b/openjudge/evaluation_strategy/voting_evaluation_strategy.py
index b3f76b3f0..0dfc19a87 100644
--- a/openjudge/evaluation_strategy/voting_evaluation_strategy.py
+++ b/openjudge/evaluation_strategy/voting_evaluation_strategy.py
@@ -69,7 +69,7 @@ async def execute(self, call_fn: Callable[..., Awaitable[Any]], **kwargs: Any) -
         values = [result.score for result in results if hasattr(result, "score")]
         if len(values) == 0:
             raise ValueError(
-                "VotingEvaluationStrategy only supports GraderScore."
+                "VotingEvaluationStrategy only supports GraderScore. "
                 "No results were returned from the evaluation correctly."
             )
 
diff --git a/openjudge/graders/base_grader.py b/openjudge/graders/base_grader.py
index 2159b44d3..c5bd6326c 100644
--- a/openjudge/graders/base_grader.py
+++ b/openjudge/graders/base_grader.py
@@ -160,9 +160,9 @@ async def aevaluate(self, executor: BaseResourceExecutor | None = None, **kwargs
         async def managed_fn(**runtime_kwargs):
             # Submit to executor for execution
             # pylint: disable=protected-access
-            # Create a shallow copy of the grader to prevent top-level state modification.
+            # Create a deep copy of the grader to prevent top-level state modification.
             if self.strategy:
-                runtime_self = self.copy()
+                runtime_self = copy.deepcopy(self)
             else:
                 runtime_self = self
 
@@ -273,46 +273,3 @@ def to_dict(self) -> dict:
             "description": self.description,
             "kwargs": self.kwargs,
         }
-
-    def copy(self):
-        """Create a copy of this grader for evaluation to prevent state sharing between samples.
-
-        This method is called by the runner to create an isolated instance of the grader
-        for each evaluation to prevent state pollution. By default, it attempts to create
-        a new instance with the same parameters, but subclasses can override this to
-        provide more specific behavior, especially when dealing with non-serializable
-        objects like model connections.
-
-        Returns:
-            BaseGrader: A new instance of the grader with the same configuration
-        """
-
-        # # Get the class of this grader
-        # grader_class = self.__class__
-
-        # # Get constructor parameters by inspecting the grader's __init__ signature
-        # sig = inspect.signature(grader_class.__init__)
-        # init_params = {}
-
-        # for param_name in sig.parameters:
-        #     if param_name in ("self", "args", "kwargs"):  # Skip special params
-        #         continue
-        #     if hasattr(self, param_name) or param_name in self.__dict__:
-        #         # Get value from instance, defaulting to the parameter's default if available
-        #         param_default = sig.parameters[param_name].default
-        #         if param_default is not inspect.Parameter.empty:
-        #             init_params[param_name] = getattr(self, param_name, param_default)
-        #         else:
-        #             init_params[param_name] = self.__dict__.get(param_name, getattr(self, param_name, None))
-
-        # # Create new instance with preserved parameters
-        # copied_grader = grader_class(**init_params)
-
-        # # Copy over any remaining attributes that weren't part of __init__
-        # for attr_name, attr_value in self.__dict__.items():
-        #     if attr_name not in init_params and not attr_name.startswith("_"):
-        #         setattr(copied_grader, attr_name, attr_value)
-
-        # return copied_grader
-
-        return copy.copy(self)
diff --git a/openjudge/models/base_chat_model.py b/openjudge/models/base_chat_model.py
index 70ad62f55..8d14d8abd 100644
--- a/openjudge/models/base_chat_model.py
+++ b/openjudge/models/base_chat_model.py
@@ -5,6 +5,7 @@
 interface for different LLM services such as OpenAI, DashScope, etc.
 """
 
+import copy
 from abc import ABC, abstractmethod
 from typing import Any, AsyncGenerator
 
@@ -144,3 +145,33 @@ def _validate_tool_choice(
             raise ValueError(
                 f"Invalid tool_choice '{tool_choice}'. " f"Available options: {', '.join(all_options)}",
             )
+
+    def __deepcopy__(self, memo: dict) -> "BaseChatModel":
+        """Deepcopy the chat model.
+        All attributes are deep-copied except ``client``, which is shared by reference.
+        Args:
+            memo: A dictionary used by the `copy` module to manage object references
+                  and prevent infinite recursion.
+        Returns:
+            BaseChatModel: A deep copy of the chat model.
+        Example:
+            >>> class MyChatModel(BaseChatModel):
+            ...     async def achat(self, *args, **kwargs):
+            ...         pass
+            >>> model = MyChatModel(model="test", stream=False)
+            >>> model_copy = copy.deepcopy(model)
+        """
+        # 1. Create a new instance via __new__ so that __init__ (and any client setup) is skipped
+        cls = self.__class__
+        new_obj = cls.__new__(cls)
+        memo[id(self)] = new_obj
+
+        # 2. Deep copy every attribute except the client
+        for k, v in self.__dict__.items():
+            if k == "client":
+                # Share the existing client object by reference so the copy reuses it
+                setattr(new_obj, k, v)
+            else:
+                setattr(new_obj, k, copy.deepcopy(v, memo))
+
+        return new_obj
diff --git a/openjudge/runner/grading_runner.py b/openjudge/runner/grading_runner.py
index f39b59a28..5a6cb3baf 100644
--- a/openjudge/runner/grading_runner.py
+++ b/openjudge/runner/grading_runner.py
@@ -11,6 +11,7 @@
 """
 
 import asyncio
+import copy
 from dataclasses import dataclass
 from typing import Any, Callable, Dict, List, Tuple, Union
 
@@ -238,7 +239,7 @@ async def _arun(
         try:
             data = parse_data_with_mapper(data, mapper)
             # Create an isolated grader instance for this evaluation to prevent state sharing
-            isolated_grader = grader.copy()
+            isolated_grader = copy.deepcopy(grader)
 
             # The grader itself handles the mapping internally
             return await isolated_grader.aevaluate(executor=executor, **data)
diff --git a/pyproject.toml b/pyproject.toml
index 2cc05596d..bf5b3264f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -47,7 +47,7 @@ dependencies = [
     "math-verify>=0.7.0,<0.8.0",
     "tqdm>=4.66.0,<5.0.0",
     "fire",
-    "numpy>=1.22.0,<2.0.0",
+    "numpy>=1.22.0",
     "dashscope>=1.19.0",
     "tiktoken>=0.7.0",
     "nltk>=3.8.1",
diff --git a/tests/evaluation_strategy/test_voting_evaluation_strategy.py b/tests/evaluation_strategy/test_voting_evaluation_strategy.py
index b2a7f1a22..c9a83622a 100644
--- a/tests/evaluation_strategy/test_voting_evaluation_strategy.py
+++ b/tests/evaluation_strategy/test_voting_evaluation_strategy.py
@@ -108,7 +108,7 @@ async def mock_call_fn():
 
         with pytest.raises(
             ValueError,
-            match="VotingEvaluationStrategy only supports GraderScore."
+            match="VotingEvaluationStrategy only supports GraderScore. "
             "No results were returned from the evaluation correctly.",
         ):
             await strategy.execute(mock_call_fn)