13 changes: 10 additions & 3 deletions openjudge/graders/base_grader.py
@@ -6,7 +6,6 @@
either scores or rankings.
"""

# import inspect
from abc import ABC, abstractmethod
from typing import Any, Dict

@@ -60,7 +59,10 @@ def __init__(
accessible to subclasses.

Example:
>>> grader = BaseGrader(
>>> class MyGrader(BaseGrader):
... async def aevaluate(self, **kwargs):
... pass
>>> grader = MyGrader(
... name="accuracy_grader",
... mode=GraderMode.POINTWISE,
... description="Evaluates answer accuracy"
@@ -189,7 +191,12 @@ def from_config(
# Extract standard grader properties from a copy to avoid mutating the input config
config_copy = dict(config)
name = config_copy.pop("name", "")
mode = config_copy.pop("mode", GraderMode.POINTWISE)
mode_value = config_copy.pop("mode", GraderMode.POINTWISE)
# Convert string to GraderMode if necessary
if isinstance(mode_value, str):
mode = GraderMode(mode_value)
else:
mode = mode_value
description = config_copy.pop("description", "")

# Create and return new instance with remaining config items as kwargs
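Reviewer note: the `from_config` change above lets configs loaded from JSON/YAML pass `mode` as a plain string. A minimal sketch of the coercion, assuming `GraderMode` is a string-valued enum (the member values shown here are illustrative, not confirmed by this diff):

```python
from enum import Enum

class GraderMode(str, Enum):  # illustrative stand-in for openjudge's enum
    POINTWISE = "pointwise"
    LISTWISE = "listwise"

config = {"name": "accuracy_grader", "mode": "pointwise"}  # e.g. parsed from YAML/JSON

mode_value = config.pop("mode", GraderMode.POINTWISE)
# Enum lookup by value turns the raw string back into a member
mode = GraderMode(mode_value) if isinstance(mode_value, str) else mode_value
assert mode is GraderMode.POINTWISE
```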
8 changes: 4 additions & 4 deletions openjudge/graders/code/__init__.py
@@ -8,10 +8,10 @@
extensible evaluation mechanisms for AI-generated content.
"""

from open_judge.graders.code.code_excution import CodeExecutionGrader
from open_judge.graders.code.code_style import CodeStyleGrader
from open_judge.graders.code.patch_similarity import PatchSimilarityGrader
from open_judge.graders.code.syntax_checker import SyntaxCheckGrader
from .code_execution import CodeExecutionGrader
from .code_style import CodeStyleGrader
from .patch_similarity import PatchSimilarityGrader
from .syntax_checker import SyntaxCheckGrader

__all__ = [
"CodeExecutionGrader",
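Reviewer note: the old imports referenced a nonexistent `open_judge` package and a misspelled `code_excution` module, so this `__init__.py` could never import. A hypothetical smoke test for the corrected re-exports (assumes `openjudge` is installed):

```python
from openjudge.graders.code import (
    CodeExecutionGrader,
    CodeStyleGrader,
    PatchSimilarityGrader,
    SyntaxCheckGrader,
)

# Each re-export should now resolve without ImportError
for grader_cls in (CodeExecutionGrader, CodeStyleGrader,
                   PatchSimilarityGrader, SyntaxCheckGrader):
    print(grader_cls.__name__)
```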
5 changes: 4 additions & 1 deletion openjudge/graders/llm_grader.py
@@ -133,10 +133,13 @@ def __init__(
)
elif isinstance(template, PromptTemplate):
self.template = template
elif isinstance(template, list):
# Support list of message dicts or ChatMessage objects
self.template = PromptTemplate.from_prompt(template)
elif isinstance(template, dict):
self.template = PromptTemplate(**template)
else:
raise ValueError("Template must be a str, dict or PromptTemplate object")
raise ValueError("Template must be a str, list, dict or PromptTemplate object")

# Initialize model
if isinstance(model, dict):
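Reviewer note: with the new `list` branch, `LLMGrader` accepts three template shapes. A sketch of each; the dict keys and the constructor call are assumptions, loosely grounded in `PromptTemplate(messages=...)` appearing later in this diff:

```python
# 1. Plain string prompt (format placeholders filled at evaluation time)
template_as_str = "Rate the answer: {answer}"

# 2. Newly accepted in this PR: a list of message dicts (or ChatMessage objects)
template_as_list = [
    {"role": "system", "content": "You are a strict grader."},
    {"role": "user", "content": "Rate the answer: {answer}"},
]

# 3. A dict of PromptTemplate constructor kwargs
template_as_dict = {"messages": template_as_list}

# grader = LLMGrader(name="rater", model=model, template=template_as_list)
```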
8 changes: 4 additions & 4 deletions openjudge/graders/multimodal/__init__.py
@@ -8,10 +8,10 @@
- Text-to-image generation quality
"""

from openjudge.graders.multimodal._internal import MLLMImage
from openjudge.graders.multimodal.image_coherence import ImageCoherenceGrader
from openjudge.graders.multimodal.image_helpfulness import ImageHelpfulnessGrader
from openjudge.graders.multimodal.text_to_image import TextToImageGrader
from ._internal import MLLMImage
from .image_coherence import ImageCoherenceGrader
from .image_helpfulness import ImageHelpfulnessGrader
from .text_to_image import TextToImageGrader

__all__ = [
# Graders
5 changes: 3 additions & 2 deletions openjudge/graders/schema.py
@@ -142,9 +142,10 @@ class GraderRank(GraderResult):


class GraderRankCallback(BaseModel):
"""Callback for grader rank result, used for .
"""Callback schema for LLM structured output in listwise grading.

Represents a ranking of items assigned by a grader along with a reason.
Used as the structured_model parameter in LLMGrader for LISTWISE mode.
The LLM returns this schema which is then converted to GraderRank.

Attributes:
rank (List[int]): The ranking of items.
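Reviewer note: to make the rewritten docstring concrete, here is a hedged sketch of the round-trip it describes — the LLM fills the callback schema, which the grader then converts to a `GraderRank`. Only `rank (List[int])` is visible in this hunk; the `reason` field is an assumption based on "along with a reason":

```python
import json
from typing import List
from pydantic import BaseModel

class GraderRankCallback(BaseModel):  # trimmed stand-in for the real schema
    rank: List[int]
    reason: str  # assumed field

# What a structured-output LLM call might return in LISTWISE mode
raw = '{"rank": [2, 1, 3], "reason": "Answer 2 is most complete."}'
callback = GraderRankCallback(**json.loads(raw))
print(callback.rank)  # [2, 1, 3]
```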
15 changes: 9 additions & 6 deletions openjudge/models/base_chat_model.py
@@ -29,7 +29,7 @@ class BaseChatModel(ABC):
... async def achat(self, *args, **kwargs):
... # Implementation here
... pass
>>> model = MyChatModel(model="qwen3-max", stream=False)
>>> model = MyChatModel(model="qwen3-32b", stream=False)
>>> print(model.model)
qwen3-32b
"""
@@ -52,11 +52,12 @@
stream: Whether the model output is streaming or not.

Example:
>>> model = BaseChatModel(model="qwen3-32b", stream=True)
>>> class MyChatModel(BaseChatModel):
... async def achat(self, *args, **kwargs):
... pass
>>> model = MyChatModel(model="qwen3-32b", stream=True)
>>> print(model.model)
qwen3-32b
>>> print(model.stream)
True
"""
self.model = model
self.stream = stream
@@ -102,9 +103,11 @@ def _validate_tool_choice(
ValueError: If tool_choice is invalid.

Example:
>>> model = BaseChatModel(model="test", stream=False)
>>> class MyChatModel(BaseChatModel):
... async def achat(self, *args, **kwargs):
... pass
>>> model = MyChatModel(model="test", stream=False)
>>> model._validate_tool_choice("auto", None) # Valid
>>> # model._validate_tool_choice(123, None) # Would raise TypeError
"""
if not isinstance(tool_choice, str):
raise TypeError(
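Reviewer note: the docstring fixes above all address the same bug — `BaseChatModel` declares `achat` abstract, so the old examples (`BaseChatModel(model=...)`) would raise rather than run, and one example constructed `qwen3-max` while printing `qwen3-32b`. A standalone sketch of why subclassing is required:

```python
from abc import ABC, abstractmethod

class BaseChatModel(ABC):  # trimmed stand-in for the real class
    def __init__(self, model: str, stream: bool = False):
        self.model = model
        self.stream = stream

    @abstractmethod
    async def achat(self, *args, **kwargs): ...

class MyChatModel(BaseChatModel):
    async def achat(self, *args, **kwargs):
        return "ok"

try:
    BaseChatModel(model="test")   # what the old docstrings showed
except TypeError as exc:
    print(exc)                    # can't instantiate abstract class BaseChatModel...

model = MyChatModel(model="qwen3-32b")  # the corrected pattern
print(model.model)                      # qwen3-32b
```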
4 changes: 2 additions & 2 deletions openjudge/models/formatter/dashscope_formatter.py
@@ -83,7 +83,7 @@ def _convert_content_to_openai(
if isinstance(content, str):
return content

# If content is a list, process each part
# If content is a list, process each part (including empty list)
if isinstance(content, list):
openai_content = []
for part in content:
@@ -143,7 +143,7 @@ def _convert_content_to_dashscope(
if isinstance(content, str):
return content

# If content is a list, process each part
# If content is a list, process each part (including empty list)
if isinstance(content, list):
dashscope_content = []
for part in content:
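Reviewer note: the comment clarification is about a falsiness trap — `[]` must still take the list branch. A toy stand-in for the dispatch (the real converters translate each part, elided here):

```python
def convert(content):
    """Toy stand-in for _convert_content_to_openai's dispatch logic."""
    if isinstance(content, str):
        return content
    if isinstance(content, list):          # an empty list lands here too
        return [part for part in content]  # real code converts each part
    return content

# A truthiness check like `if content:` would wrongly skip the empty list
assert convert([]) == []
```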
22 changes: 11 additions & 11 deletions openjudge/models/schema/prompt_template.py
@@ -167,9 +167,8 @@ def to_messages(
List[ChatMessage]: The messages for the specified language.

Raises:
AssertionError: If the specified language is not available in a
multilingual template.
ValueError: If messages format is invalid.
ValueError: If the specified language is not available in a
multilingual template, or if messages format is invalid.

Examples:
>>> template = PromptTemplate(messages=[ChatMessage(role="user", content="Hello")])
@@ -182,16 +181,17 @@
[ChatMessage(role="user", content="Hello")]
"""
if isinstance(self.messages, list):
messages = self.messages
elif isinstance(self.messages, dict):
return self.messages

if isinstance(self.messages, dict):
if not language:
language = LanguageEnum.EN
assert language in self.messages
messages = self.messages.get(language, [])
else:
raise ValueError("Invalid messages")
if language not in self.messages:
available = [lang.value for lang in self.messages.keys()]
raise ValueError(f"Language '{language.value}' not found. Available: {available}")
return self.messages[language]

return messages
raise ValueError("Invalid messages format")

@classmethod
def from_prompt(cls, prompt: Prompt) -> "PromptTemplate":
@@ -280,7 +280,7 @@ def format(
messages = [message.format(**kwargs).to_dict() for message in messages]
return messages

def get_prompt(self, language: LanguageEnum = None) -> Dict[str, List[Dict[str, str]]]:
def get_prompt(self, language: LanguageEnum | None = None) -> Dict[str, List[Dict[str, str]]]:
"""Return the core prompts (role, content) information of the messages,
in a {language: list[{'role': txt, 'content': txt}]} dictionary.
"""
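Reviewer note: replacing the bare `assert` with a `ValueError` turns a missing language into a diagnosable error (and one that survives `python -O`). A hedged sketch of the new failure mode; the import paths and the `ZH` member are assumptions based on this diff's file layout:

```python
# Import locations assumed from openjudge/models/schema/prompt_template.py
from openjudge.models.schema.prompt_template import (
    ChatMessage,
    LanguageEnum,
    PromptTemplate,
)

template = PromptTemplate(
    messages={LanguageEnum.EN: [ChatMessage(role="user", content="Hello")]}
)
try:
    template.to_messages(language=LanguageEnum.ZH)  # ZH member assumed
except ValueError as exc:
    print(exc)  # e.g. Language 'zh' not found. Available: ['en']
```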
9 changes: 6 additions & 3 deletions openjudge/utils/concurrency.py
@@ -6,6 +6,9 @@
"""

import asyncio
from typing import Awaitable, TypeVar

T = TypeVar("T")


class ConcurrencyManager:
@@ -61,15 +64,15 @@ def get_max_concurrency(self) -> int:
"""
return self._max_concurrency

async def run_with_concurrency_control(self, coro):
async def run_with_concurrency_control(self, coro: Awaitable[T]) -> T:
"""
Run a coroutine with concurrency control.

Args:
coro: The coroutine to run
coro: The coroutine to run.

Returns:
The result of the coroutine
T: The result of the coroutine.
"""
async with self._semaphore:
return await coro
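Reviewer note: with the `TypeVar`, awaiting `run_with_concurrency_control(coro)` now type-checks to the coroutine's own result type. A usage sketch; the constructor signature (`max_concurrency=...`) is an assumption inferred from `get_max_concurrency`:

```python
import asyncio

from openjudge.utils.concurrency import ConcurrencyManager

async def fetch(i: int) -> str:
    await asyncio.sleep(0.1)
    return f"result-{i}"

async def main() -> None:
    manager = ConcurrencyManager(max_concurrency=2)  # ctor signature assumed
    # Each coroutine is awaited inside the manager's semaphore, so at most
    # two fetch() calls are in flight at any time; results type-check as str.
    results = await asyncio.gather(
        *(manager.run_with_concurrency_control(fetch(i)) for i in range(5))
    )
    print(results)

asyncio.run(main())
```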
6 changes: 3 additions & 3 deletions openjudge/utils/instance.py
@@ -64,7 +64,7 @@ class should be subclass of. If provided, will check
>>> # From config dict
>>> config = {
... 'class_name': 'StringMatchGrader',
... 'module_path': 'openjudge.text.string_match',
... 'module_path': 'openjudge.graders.text.string_match',
... 'kwargs': {'ignore_case': True}
... }
>>> # instance = init_instance_by_config(config)
@@ -74,8 +74,8 @@ class should be subclass of. If provided, will check
>>> # instance = init_instance_by_config(existing_instance)
>>>
>>> # With type checking
>>> # from openjudge.grader.base import Grader
>>> # instance = init_instance_by_config(config, accept_type=Grader)
>>> # from openjudge.graders.base_grader import BaseGrader
>>> # instance = init_instance_by_config(config, accept_type=BaseGrader)
"""
# If config is already an instance, just check its type
if not isinstance(config, dict):
9 changes: 6 additions & 3 deletions openjudge/utils/tokenizer.py
@@ -80,7 +80,7 @@ def preprocess_text(self, text: str, to_lower: bool = False) -> str:
str: Preprocessed text.

Example:
>>> tokenizer = BaseTokenizer(name="test")
>>> tokenizer = SimpleTokenizer()
>>> result = tokenizer.preprocess_text(" Hello World ", to_lower=True)
>>> print(result)
hello world
@@ -139,8 +139,11 @@ def tokenize(self, text: str) -> List[str]:
# Convert token ids back to strings for comparison
token_strings = [encoding.decode([token]) for token in tokens]
return token_strings
except Exception:
# Fallback to simple splitting if tiktoken fails
except ImportError:
# Fallback to simple splitting if tiktoken not installed
return text.split()
except KeyError:
# Fallback if encoding name is invalid
return text.split()


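Reviewer note: narrowing `except Exception` to `ImportError` and `KeyError` keeps the word-split fallback for a missing dependency or an unknown model name while letting genuine bugs surface. A standalone sketch of the same pattern (`tiktoken.encoding_for_model` raises `KeyError` for unknown models); the real method may resolve encodings differently:

```python
from typing import List

def tokenize(text: str, model: str = "gpt-4") -> List[str]:
    """Standalone sketch; the real method lives on the tokenizer class."""
    try:
        import tiktoken
        encoding = tiktoken.encoding_for_model(model)  # KeyError if model unknown
        return [encoding.decode([token]) for token in encoding.encode(text)]
    except (ImportError, KeyError):
        # Only a missing install or unknown model triggers the fallback;
        # anything else (e.g. a TypeError) now propagates instead of
        # being silently swallowed.
        return text.split()

print(tokenize("hello world"))
```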
9 changes: 5 additions & 4 deletions openjudge/utils/utils.py
@@ -40,7 +40,8 @@ def repair_and_load_json(
repaired = json_str
try:
repaired = repair_json(json_str)
except Exception:
except (ValueError, TypeError):
# repair_json may fail on malformed input, keep original string
pass

try:
@@ -136,8 +137,7 @@ def create_tool_from_base_model(
Note:
The function automatically removes the 'title' field from
the JSON schema to ensure compatibility with function calling
format. This is handled by the internal [_remove_title_field]
(file://.openjudge/utils/utils.py#L33-L55) function.
format. This is handled by the internal `_remove_title_field` function.
"""
schema = structured_model.model_json_schema()

@@ -200,5 +200,6 @@ def trim_and_load_json(response: str, metric: Any = None) -> Dict[str, Any]:
except json.JSONDecodeError as e:
error_msg = f"Failed to parse JSON from response: {e}\nResponse: {response[:200]}"
if metric:
logger.error(f"{metric.name}: {error_msg}")
metric_name = getattr(metric, "name", "unknown_metric")
logger.error(f"{metric_name}: {error_msg}")
raise ValueError(error_msg) from e
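Reviewer note: previously `metric.name` raised `AttributeError` for any metric object without a `name`, masking the original JSON error. The `getattr` default makes the logging path safe for arbitrary objects; a minimal illustration:

```python
class NamedMetric:
    name = "faithfulness"

class AnonymousMetric:  # no .name attribute at all
    pass

for metric in (NamedMetric(), AnonymousMetric()):
    # Falls back to a placeholder instead of raising AttributeError
    metric_name = getattr(metric, "name", "unknown_metric")
    print(f"{metric_name}: failed to parse JSON")
```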
2 changes: 1 addition & 1 deletion tests/graders/test_llm_grader.py
@@ -67,7 +67,7 @@ def test_initialization_failure_without_template(self):
model=AsyncMock(),
name="foo",
)
assert "Template must be a str, dict or PromptTemplate object" in str(error_obj.value)
assert "Template must be a str, list, dict or PromptTemplate object" in str(error_obj.value)

def test_initialization_with_string_template(self):
"""Test successful initialization with string template"""