diff --git a/openjudge/graders/base_grader.py b/openjudge/graders/base_grader.py
index 9fbdb220..c498ebcc 100644
--- a/openjudge/graders/base_grader.py
+++ b/openjudge/graders/base_grader.py
@@ -6,7 +6,6 @@
 either scores or rankings.
 """
 
-# import inspect
 from abc import ABC, abstractmethod
 from typing import Any, Dict
 
@@ -60,7 +59,10 @@ def __init__(
         accessible to subclasses.
 
         Example:
-            >>> grader = BaseGrader(
+            >>> class MyGrader(BaseGrader):
+            ...     async def aevaluate(self, **kwargs):
+            ...         pass
+            >>> grader = MyGrader(
             ...     name="accuracy_grader",
             ...     mode=GraderMode.POINTWISE,
             ...     description="Evaluates answer accuracy"
@@ -189,7 +191,12 @@ def from_config(
         # Extract standard grader properties from a copy to avoid mutating the input config
         config_copy = dict(config)
         name = config_copy.pop("name", "")
-        mode = config_copy.pop("mode", GraderMode.POINTWISE)
+        mode_value = config_copy.pop("mode", GraderMode.POINTWISE)
+        # Convert string to GraderMode if necessary
+        if isinstance(mode_value, str):
+            mode = GraderMode(mode_value)
+        else:
+            mode = mode_value
         description = config_copy.pop("description", "")
 
         # Create and return new instance with remaining config items as kwargs
diff --git a/openjudge/graders/code/__init__.py b/openjudge/graders/code/__init__.py
index 665a3d72..2e626c59 100644
--- a/openjudge/graders/code/__init__.py
+++ b/openjudge/graders/code/__init__.py
@@ -8,10 +8,10 @@
 extensible evaluation mechanisms for AI-generated content.
 """
 
-from open_judge.graders.code.code_excution import CodeExecutionGrader
-from open_judge.graders.code.code_style import CodeStyleGrader
-from open_judge.graders.code.patch_similarity import PatchSimilarityGrader
-from open_judge.graders.code.syntax_checker import SyntaxCheckGrader
+from .code_execution import CodeExecutionGrader
+from .code_style import CodeStyleGrader
+from .patch_similarity import PatchSimilarityGrader
+from .syntax_checker import SyntaxCheckGrader
 
 __all__ = [
     "CodeExecutionGrader",
diff --git a/openjudge/graders/code/code_excution.py b/openjudge/graders/code/code_execution.py
similarity index 100%
rename from openjudge/graders/code/code_excution.py
rename to openjudge/graders/code/code_execution.py
diff --git a/openjudge/graders/llm_grader.py b/openjudge/graders/llm_grader.py
index 453cfa36..10235c72 100644
--- a/openjudge/graders/llm_grader.py
+++ b/openjudge/graders/llm_grader.py
@@ -133,10 +133,13 @@ def __init__(
             )
         elif isinstance(template, PromptTemplate):
             self.template = template
+        elif isinstance(template, list):
+            # Support list of message dicts or ChatMessage objects
+            self.template = PromptTemplate.from_prompt(template)
         elif isinstance(template, dict):
             self.template = PromptTemplate(**template)
         else:
-            raise ValueError("Template must be a str, dict or PromptTemplate object")
+            raise ValueError("Template must be a str, list, dict or PromptTemplate object")
 
         # Initialize model
         if isinstance(model, dict):
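For reviewers, a minimal usage sketch of the new list-template branch in LLMGrader. It assumes the constructor keywords used in tests/graders/test_llm_grader.py are sufficient; the message contents, the {answer} placeholder, and the grader name are illustrative only.

    from unittest.mock import AsyncMock

    from openjudge.graders.llm_grader import LLMGrader

    # A template given as a list of message dicts now goes through
    # PromptTemplate.from_prompt instead of raising ValueError.
    template = [
        {"role": "system", "content": "You are a strict grader."},
        {"role": "user", "content": "Score this answer: {answer}"},
    ]
    grader = LLMGrader(template=template, model=AsyncMock(), name="demo_grader")
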
diff --git a/openjudge/graders/multimodal/__init__.py b/openjudge/graders/multimodal/__init__.py
index 93021610..f92493a7 100644
--- a/openjudge/graders/multimodal/__init__.py
+++ b/openjudge/graders/multimodal/__init__.py
@@ -8,10 +8,10 @@
 - Text-to-image generation quality
 """
 
-from openjudge.graders.multimodal._internal import MLLMImage
-from openjudge.graders.multimodal.image_coherence import ImageCoherenceGrader
-from openjudge.graders.multimodal.image_helpfulness import ImageHelpfulnessGrader
-from openjudge.graders.multimodal.text_to_image import TextToImageGrader
+from ._internal import MLLMImage
+from .image_coherence import ImageCoherenceGrader
+from .image_helpfulness import ImageHelpfulnessGrader
+from .text_to_image import TextToImageGrader
 
 __all__ = [
     # Graders
diff --git a/openjudge/graders/schema.py b/openjudge/graders/schema.py
index 02a2dc33..b40b367f 100644
--- a/openjudge/graders/schema.py
+++ b/openjudge/graders/schema.py
@@ -142,9 +142,10 @@ class GraderRank(GraderResult):
 
 
 class GraderRankCallback(BaseModel):
-    """Callback for grader rank result, used for .
+    """Callback schema for LLM structured output in listwise grading.
 
-    Represents a ranking of items assigned by a grader along with a reason.
+    Used as the structured_model parameter in LLMGrader for LISTWISE mode.
+    The LLM returns this schema which is then converted to GraderRank.
 
     Attributes:
         rank (List[int]): The ranking of items.
diff --git a/openjudge/models/base_chat_model.py b/openjudge/models/base_chat_model.py
index 290d4b9f..50761382 100644
--- a/openjudge/models/base_chat_model.py
+++ b/openjudge/models/base_chat_model.py
@@ -29,7 +29,7 @@ class BaseChatModel(ABC):
         ...     async def achat(self, *args, **kwargs):
         ...         # Implementation here
         ...         pass
-        >>> model = MyChatModel(model="qwen3-max", stream=False)
+        >>> model = MyChatModel(model="qwen3-32b", stream=False)
         >>> print(model.model)
         qwen3-32b
     """
@@ -52,11 +52,12 @@ def __init__(
             stream: Whether the model output is streaming or not.
 
         Example:
-            >>> model = BaseChatModel(model="qwen3-32b", stream=True)
+            >>> class MyChatModel(BaseChatModel):
+            ...     async def achat(self, *args, **kwargs):
+            ...         pass
+            >>> model = MyChatModel(model="qwen3-32b", stream=True)
             >>> print(model.model)
             qwen3-32b
-            >>> print(model.stream)
-            True
         """
         self.model = model
         self.stream = stream
@@ -102,9 +103,11 @@ def _validate_tool_choice(
             ValueError: If tool_choice is invalid.
 
         Example:
-            >>> model = BaseChatModel(model="test", stream=False)
+            >>> class MyChatModel(BaseChatModel):
+            ...     async def achat(self, *args, **kwargs):
+            ...         pass
+            >>> model = MyChatModel(model="test", stream=False)
             >>> model._validate_tool_choice("auto", None)  # Valid
-            >>> # model._validate_tool_choice(123, None)  # Would raise TypeError
         """
         if not isinstance(tool_choice, str):
             raise TypeError(
diff --git a/openjudge/models/formatter/dashscope_formatter.py b/openjudge/models/formatter/dashscope_formatter.py
index b445ae8f..9a9e3122 100644
--- a/openjudge/models/formatter/dashscope_formatter.py
+++ b/openjudge/models/formatter/dashscope_formatter.py
@@ -83,7 +83,7 @@ def _convert_content_to_openai(
         if isinstance(content, str):
             return content
 
-        # If content is a list, process each part
+        # If content is a list, process each part (including empty list)
         if isinstance(content, list):
             openai_content = []
             for part in content:
@@ -143,7 +143,7 @@ def _convert_content_to_dashscope(
         if isinstance(content, str):
             return content
 
-        # If content is a list, process each part
+        # If content is a list, process each part (including empty list)
         if isinstance(content, list):
             dashscope_content = []
             for part in content:
diff --git a/openjudge/models/schema/prompt_template.py b/openjudge/models/schema/prompt_template.py
index ea39ef81..309fed09 100644
--- a/openjudge/models/schema/prompt_template.py
+++ b/openjudge/models/schema/prompt_template.py
@@ -167,9 +167,8 @@ def to_messages(
             List[ChatMessage]: The messages for the specified language.
 
         Raises:
-            AssertionError: If the specified language is not available in a
-                multilingual template.
-            ValueError: If messages format is invalid.
+            ValueError: If the specified language is not available in a
+                multilingual template, or if messages format is invalid.
 
         Examples:
             >>> template = PromptTemplate(messages=[ChatMessage(role="user", content="Hello")])
@@ -182,16 +181,17 @@ def to_messages(
             [ChatMessage(role="user", content="Hello")]
         """
         if isinstance(self.messages, list):
-            messages = self.messages
-        elif isinstance(self.messages, dict):
+            return self.messages
+
+        if isinstance(self.messages, dict):
             if not language:
                 language = LanguageEnum.EN
-            assert language in self.messages
-            messages = self.messages.get(language, [])
-        else:
-            raise ValueError("Invalid messages")
+            if language not in self.messages:
+                available = [lang.value for lang in self.messages.keys()]
+                raise ValueError(f"Language '{language.value}' not found. Available: {available}")
+            return self.messages[language]
 
-        return messages
+        raise ValueError("Invalid messages format")
 
     @classmethod
     def from_prompt(cls, prompt: Prompt) -> "PromptTemplate":
@@ -280,7 +280,7 @@ def format(
             messages = [message.format(**kwargs).to_dict() for message in messages]
         return messages
 
-    def get_prompt(self, language: LanguageEnum = None) -> Dict[str, List[Dict[str, str]]]:
+    def get_prompt(self, language: LanguageEnum | None = None) -> Dict[str, List[Dict[str, str]]]:
         """Return the core prompts (role, content) information of the messages,
         in a {language: list[{'role': txt, 'content': txt}]} dictionary.
         """
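A sketch of the new to_messages error path for multilingual templates. The import locations of ChatMessage and LanguageEnum, and the ZH member, are assumptions; only PromptTemplate, LanguageEnum.EN, and the ValueError message format are confirmed by the hunk above.

    # Assumed import locations; adjust to the actual module layout.
    from openjudge.models.schema.prompt_template import ChatMessage, LanguageEnum, PromptTemplate

    template = PromptTemplate(
        messages={LanguageEnum.EN: [ChatMessage(role="user", content="Hello")]}
    )

    template.to_messages()  # no language given, defaults to LanguageEnum.EN

    try:
        template.to_messages(language=LanguageEnum.ZH)  # ZH is a hypothetical member
    except ValueError as exc:
        print(exc)  # e.g. "Language 'zh' not found. Available: ['en']"
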
diff --git a/openjudge/utils/concurrency.py b/openjudge/utils/concurrency.py
index 82b3701e..352daff4 100644
--- a/openjudge/utils/concurrency.py
+++ b/openjudge/utils/concurrency.py
@@ -6,6 +6,9 @@
 """
 
 import asyncio
+from typing import Awaitable, TypeVar
+
+T = TypeVar("T")
 
 
 class ConcurrencyManager:
@@ -61,15 +64,15 @@ def get_max_concurrency(self) -> int:
         """
         return self._max_concurrency
 
-    async def run_with_concurrency_control(self, coro):
+    async def run_with_concurrency_control(self, coro: Awaitable[T]) -> T:
         """
         Run a coroutine with concurrency control.
 
         Args:
-            coro: The coroutine to run
+            coro: The coroutine to run.
 
         Returns:
-            The result of the coroutine
+            T: The result of the coroutine.
         """
         async with self._semaphore:
             return await coro
diff --git a/openjudge/utils/instance.py b/openjudge/utils/instance.py
index 6e893896..1b13aec5 100644
--- a/openjudge/utils/instance.py
+++ b/openjudge/utils/instance.py
@@ -64,7 +64,7 @@ class should be subclass of. If provided, will check
         >>> # From config dict
         >>> config = {
         ...     'class_name': 'StringMatchGrader',
-        ...     'module_path': 'openjudge.text.string_match',
+        ...     'module_path': 'openjudge.graders.text.string_match',
         ...     'kwargs': {'ignore_case': True}
         ... }
         >>> # instance = init_instance_by_config(config)
@@ -74,8 +74,8 @@ class should be subclass of. If provided, will check
         >>> # instance = init_instance_by_config(existing_instance)
         >>>
         >>> # With type checking
-        >>> # from openjudge.grader.base import Grader
-        >>> # instance = init_instance_by_config(config, accept_type=Grader)
+        >>> # from openjudge.graders.base_grader import BaseGrader
+        >>> # instance = init_instance_by_config(config, accept_type=BaseGrader)
     """
     # If config is already an instance, just check its type
     if not isinstance(config, dict):
diff --git a/openjudge/utils/tokenizer.py b/openjudge/utils/tokenizer.py
index b7eb1415..b7f8c17e 100644
--- a/openjudge/utils/tokenizer.py
+++ b/openjudge/utils/tokenizer.py
@@ -80,7 +80,7 @@ def preprocess_text(self, text: str, to_lower: bool = False) -> str:
             str: Preprocessed text.
 
         Example:
-            >>> tokenizer = BaseTokenizer(name="test")
+            >>> tokenizer = SimpleTokenizer()
            >>> result = tokenizer.preprocess_text(" Hello World ", to_lower=True)
             >>> print(result)
             hello world
@@ -139,8 +139,11 @@ def tokenize(self, text: str) -> List[str]:
             # Convert token ids back to strings for comparison
             token_strings = [encoding.decode([token]) for token in tokens]
             return token_strings
-        except Exception:
-            # Fallback to simple splitting if tiktoken fails
+        except ImportError:
+            # Fallback to simple splitting if tiktoken not installed
+            return text.split()
+        except KeyError:
+            # Fallback if encoding name is invalid
             return text.split()
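To show what the Awaitable[T] -> T annotation on run_with_concurrency_control buys, a small sketch; the max_concurrency constructor argument is inferred from get_max_concurrency() and may be named differently.

    import asyncio

    from openjudge.utils.concurrency import ConcurrencyManager

    async def main() -> None:
        manager = ConcurrencyManager(max_concurrency=2)  # argument name assumed

        async def fetch(i: int) -> int:
            await asyncio.sleep(0.1)
            return i

        # With the new annotations, type checkers infer each gathered result as int.
        results = await asyncio.gather(
            *(manager.run_with_concurrency_control(fetch(i)) for i in range(5))
        )
        print(results)

    asyncio.run(main())
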
diff --git a/openjudge/utils/utils.py b/openjudge/utils/utils.py
index 2d8e775c..92c971c1 100644
--- a/openjudge/utils/utils.py
+++ b/openjudge/utils/utils.py
@@ -40,7 +40,8 @@ def repair_and_load_json(
     repaired = json_str
     try:
         repaired = repair_json(json_str)
-    except Exception:
+    except (ValueError, TypeError):
+        # repair_json may fail on malformed input, keep original string
         pass
 
     try:
@@ -136,8 +137,7 @@ def create_tool_from_base_model(
    Note:
        The function automatically removes the 'title' field from
        the JSON schema to ensure compatibility with function calling
-        format. This is handled by the internal [_remove_title_field]
-        (file://.openjudge/utils/utils.py#L33-L55) function.
+        format. This is handled by the internal `_remove_title_field` function.
    """
 
    schema = structured_model.model_json_schema()
@@ -200,5 +200,6 @@ def trim_and_load_json(response: str, metric: Any = None) -> Dict[str, Any]:
     except json.JSONDecodeError as e:
         error_msg = f"Failed to parse JSON from response: {e}\nResponse: {response[:200]}"
         if metric:
-            logger.error(f"{metric.name}: {error_msg}")
+            metric_name = getattr(metric, "name", "unknown_metric")
+            logger.error(f"{metric_name}: {error_msg}")
         raise ValueError(error_msg) from e
diff --git a/tests/graders/test_llm_grader.py b/tests/graders/test_llm_grader.py
index ddeaf978..4b230f88 100644
--- a/tests/graders/test_llm_grader.py
+++ b/tests/graders/test_llm_grader.py
@@ -67,7 +67,7 @@ def test_initialization_failure_without_template(self):
                 model=AsyncMock(),
                 name="foo",
             )
-        assert "Template must be a str, dict or PromptTemplate object" in str(error_obj.value)
+        assert "Template must be a str, list, dict or PromptTemplate object" in str(error_obj.value)
 
     def test_initialization_with_string_template(self):
         """Test successful initialization with string template"""
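Finally, a sketch of the error path that the getattr change in trim_and_load_json hardens, assuming a malformed response reaches the json.JSONDecodeError branch shown above; the name-less metric object is purely illustrative.

    from openjudge.utils.utils import trim_and_load_json

    class NamelessMetric:
        """A metric-like object that has no `name` attribute."""

    try:
        # Previously the logging call read `metric.name` and raised AttributeError,
        # masking the real parsing failure; now it logs "unknown_metric" instead.
        trim_and_load_json("not valid json", metric=NamelessMetric())
    except ValueError as exc:
        print(exc)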