13 changes: 10 additions & 3 deletions openjudge/graders/base_grader.py
@@ -6,7 +6,6 @@
either scores or rankings.
"""

# import inspect
from abc import ABC, abstractmethod
from typing import Any, Dict

@@ -60,7 +59,10 @@ def __init__(
accessible to subclasses.

Example:
>>> grader = BaseGrader(
>>> class MyGrader(BaseGrader):
... async def aevaluate(self, **kwargs):
... pass
>>> grader = MyGrader(
... name="accuracy_grader",
... mode=GraderMode.POINTWISE,
... description="Evaluates answer accuracy"
@@ -189,7 +191,12 @@ def from_config(
# Extract standard grader properties from a copy to avoid mutating the input config
config_copy = dict(config)
name = config_copy.pop("name", "")
mode = config_copy.pop("mode", GraderMode.POINTWISE)
mode_value = config_copy.pop("mode", GraderMode.POINTWISE)
# Convert string to GraderMode if necessary
if isinstance(mode_value, str):
mode = GraderMode(mode_value)
else:
mode = mode_value
description = config_copy.pop("description", "")

# Create and return new instance with remaining config items as kwargs
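Reviewer note: the `from_config` change above lets configs loaded from JSON/YAML pass `mode` as a plain string. A minimal sketch of the coercion, assuming `GraderMode` is a string-valued enum (the member values shown here are illustrative, not confirmed by this diff):

```python
from enum import Enum

class GraderMode(str, Enum):  # illustrative stand-in for openjudge's enum
    POINTWISE = "pointwise"
    LISTWISE = "listwise"

config = {"name": "accuracy_grader", "mode": "pointwise"}  # e.g. parsed from YAML/JSON

mode_value = config.pop("mode", GraderMode.POINTWISE)
# Enum lookup by value turns the raw string back into a member
mode = GraderMode(mode_value) if isinstance(mode_value, str) else mode_value
assert mode is GraderMode.POINTWISE
```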
8 changes: 4 additions & 4 deletions openjudge/graders/code/__init__.py
@@ -8,10 +8,10 @@
extensible evaluation mechanisms for AI-generated content.
"""

from open_judge.graders.code.code_excution import CodeExecutionGrader
from open_judge.graders.code.code_style import CodeStyleGrader
from open_judge.graders.code.patch_similarity import PatchSimilarityGrader
from open_judge.graders.code.syntax_checker import SyntaxCheckGrader
from .code_execution import CodeExecutionGrader
from .code_style import CodeStyleGrader
from .patch_similarity import PatchSimilarityGrader
from .syntax_checker import SyntaxCheckGrader

__all__ = [
"CodeExecutionGrader",
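Reviewer note: the old imports referenced a nonexistent `open_judge` package and a misspelled `code_excution` module, so this `__init__.py` could never import. A hypothetical smoke test for the corrected re-exports (assumes `openjudge` is installed):

```python
from openjudge.graders.code import (
    CodeExecutionGrader,
    CodeStyleGrader,
    PatchSimilarityGrader,
    SyntaxCheckGrader,
)

# Each re-export should now resolve without ImportError
for grader_cls in (CodeExecutionGrader, CodeStyleGrader,
                   PatchSimilarityGrader, SyntaxCheckGrader):
    print(grader_cls.__name__)
```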
5 changes: 4 additions & 1 deletion openjudge/graders/llm_grader.py
@@ -133,10 +133,13 @@ def __init__(
)
elif isinstance(template, PromptTemplate):
self.template = template
elif isinstance(template, list):
# Support list of message dicts or ChatMessage objects
self.template = PromptTemplate.from_prompt(template)
elif isinstance(template, dict):
self.template = PromptTemplate(**template)
else:
raise ValueError("Template must be a str, dict or PromptTemplate object")
raise ValueError("Template must be a str, list, dict or PromptTemplate object")

# Initialize model
if isinstance(model, dict):
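Reviewer note: with the new `list` branch, `LLMGrader` accepts three template shapes. A sketch of each; the dict keys and the constructor call are assumptions, loosely grounded in `PromptTemplate(messages=...)` appearing later in this diff:

```python
# 1. Plain string prompt (format placeholders filled at evaluation time)
template_as_str = "Rate the answer: {answer}"

# 2. Newly accepted in this PR: a list of message dicts (or ChatMessage objects)
template_as_list = [
    {"role": "system", "content": "You are a strict grader."},
    {"role": "user", "content": "Rate the answer: {answer}"},
]

# 3. A dict of PromptTemplate constructor kwargs
template_as_dict = {"messages": template_as_list}

# grader = LLMGrader(name="rater", model=model, template=template_as_list)
```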
8 changes: 4 additions & 4 deletions openjudge/graders/multimodal/__init__.py
@@ -8,10 +8,10 @@
- Text-to-image generation quality
"""

from openjudge.graders.multimodal._internal import MLLMImage
from openjudge.graders.multimodal.image_coherence import ImageCoherenceGrader
from openjudge.graders.multimodal.image_helpfulness import ImageHelpfulnessGrader
from openjudge.graders.multimodal.text_to_image import TextToImageGrader
from ._internal import MLLMImage
from .image_coherence import ImageCoherenceGrader
from .image_helpfulness import ImageHelpfulnessGrader
from .text_to_image import TextToImageGrader

__all__ = [
# Graders
5 changes: 3 additions & 2 deletions openjudge/graders/schema.py
@@ -142,9 +142,10 @@ class GraderRank(GraderResult):


class GraderRankCallback(BaseModel):
"""Callback for grader rank result, used for .
"""Callback schema for LLM structured output in listwise grading.

Represents a ranking of items assigned by a grader along with a reason.
Used as the structured_model parameter in LLMGrader for LISTWISE mode.
The LLM returns this schema which is then converted to GraderRank.

Attributes:
rank (List[int]): The ranking of items.
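Reviewer note: to make the rewritten docstring concrete, here is a hedged sketch of the round-trip it describes — the LLM fills the callback schema, which the grader then converts to a `GraderRank`. Only `rank (List[int])` is visible in this hunk; the `reason` field is an assumption based on "along with a reason":

```python
import json
from typing import List
from pydantic import BaseModel

class GraderRankCallback(BaseModel):  # trimmed stand-in for the real schema
    rank: List[int]
    reason: str  # assumed field

# What a structured-output LLM call might return in LISTWISE mode
raw = '{"rank": [2, 1, 3], "reason": "Answer 2 is most complete."}'
callback = GraderRankCallback(**json.loads(raw))
print(callback.rank)  # [2, 1, 3]
```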
15 changes: 9 additions & 6 deletions openjudge/models/base_chat_model.py
@@ -29,7 +29,7 @@ class BaseChatModel(ABC):
... async def achat(self, *args, **kwargs):
... # Implementation here
... pass
>>> model = MyChatModel(model="qwen3-max", stream=False)
>>> model = MyChatModel(model="qwen3-32b", stream=False)
>>> print(model.model)
qwen3-32b
"""
@@ -52,11 +52,12 @@
stream: Whether the model output is streaming or not.

Example:
>>> model = BaseChatModel(model="qwen3-32b", stream=True)
>>> class MyChatModel(BaseChatModel):
... async def achat(self, *args, **kwargs):
... pass
>>> model = MyChatModel(model="qwen3-32b", stream=True)
>>> print(model.model)
qwen3-32b
>>> print(model.stream)
True
"""
self.model = model
self.stream = stream
@@ -102,9 +103,11 @@ def _validate_tool_choice(
ValueError: If tool_choice is invalid.

Example:
>>> model = BaseChatModel(model="test", stream=False)
>>> class MyChatModel(BaseChatModel):
... async def achat(self, *args, **kwargs):
... pass
>>> model = MyChatModel(model="test", stream=False)
>>> model._validate_tool_choice("auto", None) # Valid
>>> # model._validate_tool_choice(123, None) # Would raise TypeError
"""
if not isinstance(tool_choice, str):
raise TypeError(
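Reviewer note: the docstring fixes above all address the same bug — `BaseChatModel` declares `achat` abstract, so the old examples (`BaseChatModel(model=...)`) would raise rather than run, and one example constructed `qwen3-max` while printing `qwen3-32b`. A standalone sketch of why subclassing is required:

```python
from abc import ABC, abstractmethod

class BaseChatModel(ABC):  # trimmed stand-in for the real class
    def __init__(self, model: str, stream: bool = False):
        self.model = model
        self.stream = stream

    @abstractmethod
    async def achat(self, *args, **kwargs): ...

class MyChatModel(BaseChatModel):
    async def achat(self, *args, **kwargs):
        return "ok"

try:
    BaseChatModel(model="test")   # what the old docstrings showed
except TypeError as exc:
    print(exc)                    # can't instantiate abstract class BaseChatModel...

model = MyChatModel(model="qwen3-32b")  # the corrected pattern
print(model.model)                      # qwen3-32b
```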
4 changes: 2 additions & 2 deletions openjudge/models/formatter/dashscope_formatter.py
@@ -83,7 +83,7 @@ def _convert_content_to_openai(
if isinstance(content, str):
return content

# If content is a list, process each part
# If content is a list, process each part (including empty list)
if isinstance(content, list):
openai_content = []
for part in content:
@@ -143,7 +143,7 @@ def _convert_content_to_dashscope(
if isinstance(content, str):
return content

# If content is a list, process each part
# If content is a list, process each part (including empty list)
if isinstance(content, list):
dashscope_content = []
for part in content:
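Reviewer note: the comment clarification is about a falsiness trap — `[]` must still take the list branch. A toy stand-in for the dispatch (the real converters translate each part, elided here):

```python
def convert(content):
    """Toy stand-in for _convert_content_to_openai's dispatch logic."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):          # an empty list lands here too
        return [part for part in content]  # real code converts each part
    return content

# A truthiness check like `if content:` would wrongly skip the empty list
assert convert([]) == []
```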
22 changes: 11 additions & 11 deletions openjudge/models/schema/prompt_template.py
@@ -167,9 +167,8 @@ def to_messages(
List[ChatMessage]: The messages for the specified language.

Raises:
AssertionError: If the specified language is not available in a
multilingual template.
ValueError: If messages format is invalid.
ValueError: If the specified language is not available in a
multilingual template, or if messages format is invalid.

Examples:
>>> template = PromptTemplate(messages=[ChatMessage(role="user", content="Hello")])
@@ -182,16 +181,17 @@
[ChatMessage(role="user", content="Hello")]
"""
if isinstance(self.messages, list):
messages = self.messages
elif isinstance(self.messages, dict):
return self.messages

if isinstance(self.messages, dict):
if not language:
language = LanguageEnum.EN
assert language in self.messages
messages = self.messages.get(language, [])
else:
raise ValueError("Invalid messages")
if language not in self.messages:
available = [lang.value for lang in self.messages.keys()]
raise ValueError(f"Language '{language.value}' not found. Available: {available}")
return self.messages[language]

return messages
raise ValueError("Invalid messages format")

@classmethod
def from_prompt(cls, prompt: Prompt) -> "PromptTemplate":
@@ -280,7 +280,7 @@ def format(
messages = [message.format(**kwargs).to_dict() for message in messages]
return messages

def get_prompt(self, language: LanguageEnum = None) -> Dict[str, List[Dict[str, str]]]:
def get_prompt(self, language: LanguageEnum | None = None) -> Dict[str, List[Dict[str, str]]]:
"""Return the core prompts (role, content) information of the messages,
in a {language: list[{'role': txt, 'content': txt}]} dictionary.
"""
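Reviewer note: replacing the bare `assert` with a `ValueError` turns a missing language into a diagnosable error (and one that survives `python -O`). A hedged sketch of the new failure mode; the import paths and the `ZH` member are assumptions based on this diff's file layout:

```python
# Import locations assumed from openjudge/models/schema/prompt_template.py
from openjudge.models.schema.prompt_template import (
    ChatMessage,
    LanguageEnum,
    PromptTemplate,
)

template = PromptTemplate(
    messages={LanguageEnum.EN: [ChatMessage(role="user", content="Hello")]}
)
try:
    template.to_messages(language=LanguageEnum.ZH)  # ZH member assumed
except ValueError as exc:
    print(exc)  # e.g. Language 'zh' not found. Available: ['en']
```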
9 changes: 6 additions & 3 deletions openjudge/utils/concurrency.py
@@ -6,6 +6,9 @@
"""

import asyncio
from typing import Awaitable, TypeVar

T = TypeVar("T")


class ConcurrencyManager:
@@ -61,15 +64,15 @@ def get_max_concurrency(self) -> int:
"""
return self._max_concurrency

async def run_with_concurrency_control(self, coro):
async def run_with_concurrency_control(self, coro: Awaitable[T]) -> T:
"""
Run a coroutine with concurrency control.

Args:
coro: The coroutine to run
coro: The coroutine to run.

Returns:
The result of the coroutine
T: The result of the coroutine.
"""
async with self._semaphore:
return await coro
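Reviewer note: with the `TypeVar`, awaiting `run_with_concurrency_control(coro)` now type-checks to the coroutine's own result type. A usage sketch; the constructor signature (`max_concurrency=...`) is an assumption inferred from `get_max_concurrency`:

```python
import asyncio

from openjudge.utils.concurrency import ConcurrencyManager

async def fetch(i: int) -> str:
    await asyncio.sleep(0.1)
    return f"result-{i}"

async def main() -> None:
    manager = ConcurrencyManager(max_concurrency=2)  # ctor signature assumed
    # Each coroutine is awaited inside the manager's semaphore, so at most
    # two fetch() calls are in flight at any time; results type-check as str.
    results = await asyncio.gather(
        *(manager.run_with_concurrency_control(fetch(i)) for i in range(5))
    )
    print(results)

asyncio.run(main())
```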
6 changes: 3 additions & 3 deletions openjudge/utils/instance.py
@@ -64,7 +64,7 @@ class should be subclass of. If provided, will check
>>> # From config dict
>>> config = {
... 'class_name': 'StringMatchGrader',
... 'module_path': 'openjudge.text.string_match',
... 'module_path': 'openjudge.graders.text.string_match',
... 'kwargs': {'ignore_case': True}
... }
>>> # instance = init_instance_by_config(config)
@@ -74,8 +74,8 @@ class should be subclass of. If provided, will check
>>> # instance = init_instance_by_config(existing_instance)
>>>
>>> # With type checking
>>> # from openjudge.grader.base import Grader
>>> # instance = init_instance_by_config(config, accept_type=Grader)
>>> # from openjudge.graders.base_grader import BaseGrader
>>> # instance = init_instance_by_config(config, accept_type=BaseGrader)
"""
# If config is already an instance, just check its type
if not isinstance(config, dict):
9 changes: 6 additions & 3 deletions openjudge/utils/tokenizer.py
@@ -80,7 +80,7 @@ def preprocess_text(self, text: str, to_lower: bool = False) -> str:
str: Preprocessed text.

Example:
>>> tokenizer = BaseTokenizer(name="test")
>>> tokenizer = SimpleTokenizer()
>>> result = tokenizer.preprocess_text(" Hello World ", to_lower=True)
>>> print(result)
hello world
@@ -139,8 +139,11 @@ def tokenize(self, text: str) -> List[str]:
# Convert token ids back to strings for comparison
token_strings = [encoding.decode([token]) for token in tokens]
return token_strings
except Exception:
# Fallback to simple splitting if tiktoken fails
except ImportError:
# Fallback to simple splitting if tiktoken not installed
return text.split()
except KeyError:
# Fallback if encoding name is invalid
return text.split()


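Reviewer note: narrowing `except Exception` to `ImportError` and `KeyError` keeps the word-split fallback for a missing dependency or an unknown model name while letting genuine bugs surface. A standalone sketch of the same pattern (`tiktoken.encoding_for_model` raises `KeyError` for unknown models); the real method may resolve encodings differently:

```python
from typing import List

def tokenize(text: str, model: str = "gpt-4") -> List[str]:
    """Standalone sketch; the real method lives on the tokenizer class."""
    try:
        import tiktoken
        encoding = tiktoken.encoding_for_model(model)  # KeyError if model unknown
        return [encoding.decode([token]) for token in encoding.encode(text)]
    except (ImportError, KeyError):
        # Only a missing install or unknown model triggers the fallback;
        # anything else (e.g. a TypeError) now propagates instead of
        # being silently swallowed.
        return text.split()

print(tokenize("hello world"))
```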
9 changes: 5 additions & 4 deletions openjudge/utils/utils.py
@@ -40,7 +40,8 @@ def repair_and_load_json(
repaired = json_str
try:
repaired = repair_json(json_str)
except Exception:
except (ValueError, TypeError):
# repair_json may fail on malformed input, keep original string
pass

try:
@@ -136,8 +137,7 @@ def create_tool_from_base_model(
Note:
The function automatically removes the 'title' field from
the JSON schema to ensure compatibility with function calling
format. This is handled by the internal [_remove_title_field]
(file://.openjudge/utils/utils.py#L33-L55) function.
format. This is handled by the internal `_remove_title_field` function.
"""
schema = structured_model.model_json_schema()

@@ -200,5 +200,6 @@ def trim_and_load_json(response: str, metric: Any = None) -> Dict[str, Any]:
except json.JSONDecodeError as e:
error_msg = f"Failed to parse JSON from response: {e}\nResponse: {response[:200]}"
if metric:
logger.error(f"{metric.name}: {error_msg}")
metric_name = getattr(metric, "name", "unknown_metric")
logger.error(f"{metric_name}: {error_msg}")
raise ValueError(error_msg) from e
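Reviewer note: previously `metric.name` raised `AttributeError` for any metric object without a `name`, masking the original JSON error. The `getattr` default makes the logging path safe for arbitrary objects; a minimal illustration:

```python
class NamedMetric:
    name = "faithfulness"

class AnonymousMetric:  # no .name attribute at all
    pass

for metric in (NamedMetric(), AnonymousMetric()):
    # Falls back to a placeholder instead of raising AttributeError
    metric_name = getattr(metric, "name", "unknown_metric")
    print(f"{metric_name}: failed to parse JSON")
```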
2 changes: 1 addition & 1 deletion tests/graders/test_llm_grader.py
@@ -67,7 +67,7 @@ def test_initialization_failure_without_template(self):
model=AsyncMock(),
name="foo",
)
assert "Template must be a str, dict or PromptTemplate object" in str(error_obj.value)
assert "Template must be a str, list, dict or PromptTemplate object" in str(error_obj.value)

def test_initialization_with_string_template(self):
"""Test successful initialization with string template"""