
Commit c3c924d

Merge pull request #2900 from Agenta-AI/release/v0.62.1
v0.62.1
2 parents 90cd47a + bc39174 commit c3c924d

File tree

14 files changed: +160, -33 lines


api/oss/src/resources/evaluators/evaluators.py

Lines changed: 6 additions & 6 deletions
@@ -229,12 +229,12 @@
         "description": "Extract information from the user's response.",
         "type": "object",
         "properties": {
-            "correctness": {
+            "score": {
                 "type": "boolean",
                 "description": "The grade results",
             }
         },
-        "required": ["correctness"],
+        "required": ["score"],
         "strict": True,
     },
 },
@@ -264,12 +264,12 @@
         "description": "Extract information from the user's response.",
         "type": "object",
         "properties": {
-            "correctness": {
+            "score": {
                 "type": "boolean",
                 "description": "The hallucination detection result",
             }
         },
-        "required": ["correctness"],
+        "required": ["score"],
         "strict": True,
     },
 },
@@ -339,12 +339,12 @@
         "description": "Extract information from the user's response.",
         "type": "object",
        "properties": {
-            "correctness": {
+            "score": {
                 "type": "boolean",
                 "description": "The grade results",
             }
         },
-        "required": ["correctness"],
+        "required": ["score"],
         "strict": True,
     },
 },
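All three hunks apply the same rename: the boolean field the judge returns is now `score` rather than `correctness`, updated in both `properties` and `required`. As a rough sketch, the structured-output schema after the change looks like the following (reconstructed from the hunks above; the variable name and exact nesting level are assumptions, not verbatim source):

```python
# Sketch reconstructed from the hunks above; the variable name and the
# exact nesting level are assumptions, not verbatim from evaluators.py.
judge_output_schema = {
    "description": "Extract information from the user's response.",
    "type": "object",
    "properties": {
        "score": {  # renamed from "correctness"
            "type": "boolean",
            "description": "The grade results",
        }
    },
    "required": ["score"],  # must stay in sync with the property name
    "strict": True,
}
```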

api/oss/src/services/converters.py

Lines changed: 0 additions & 1 deletion
@@ -13,7 +13,6 @@
     HumanEvaluationScenario,
     EvaluationScenarioOutput,
 )
-from oss.src.services import db_manager
 from oss.src.models.db_models import (
     EvaluationDB,
     HumanEvaluationDB,

api/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "api"
-version = "0.62.0"
+version = "0.62.1"
 description = "Agenta API"
 authors = [
     { name = "Mahmoud Mabrouk", email = "mahmoud@agenta.ai" },
Lines changed: 71 additions & 0 deletions (new file)

@@ -0,0 +1,71 @@
+---
+title: "Customize LLM-as-a-Judge Output Schemas"
+slug: customize-llm-as-a-judge-output-schemas
+date: 2025-11-10
+tags: [v0.62.0]
+description: "Learn how to customize LLM-as-a-Judge evaluator output schemas with binary, multiclass, or custom JSON formats. Enable reasoning for better evaluation quality and structure feedback to match your workflow needs."
+---
+
+import Image from "@theme/IdealImage";
+
+The LLM-as-a-Judge evaluator now supports custom output schemas. You can define exactly what feedback structure you need for your evaluations.
+
+
+<div style={{display: 'flex', justifyContent: 'center', gap: '24px', margin: '20px 0'}}>
+  <Image
+    img={require('/static/images/changelog/changelog-llm-as-a-judge-response-1.png')}
+    alt="Custom output schemas in LLM-as-a-Judge - Example 1"
+    style={{width: '48%', minWidth: 0}}
+  />
+  <Image
+    img={require('/static/images/changelog/changelog-llm-as-a-judge-response-2.png')}
+    alt="Custom output schemas in LLM-as-a-Judge - Example 2"
+    style={{width: '48%', minWidth: 0}}
+  />
+</div>
+
+## What's New
+
+### **Flexible Output Types**
+Configure the evaluator to return different types of outputs:
+- **Binary**: Return a simple yes/no or pass/fail score
+- **Multiclass**: Choose from multiple predefined categories
+- **Custom JSON**: Define any structure that fits your use case
+
+### **Include Reasoning for Better Quality**
+Enable the reasoning option to have the LLM explain its evaluation. This improves prediction quality because the model thinks through its assessment before providing a score.
+
+When you include reasoning, the evaluator returns both the score and a detailed explanation of how it arrived at that judgment.
+
+### **Advanced: Raw JSON Schema**
+For complete control, provide a raw JSON schema. The evaluator will return responses that match your exact structure.
+
+This lets you capture multiple scores, categorical labels, confidence levels, and custom fields in a single evaluation pass. You can structure the output however your workflow requires.
+
+### **Use Custom Schemas in Evaluation**
+Once configured, your custom schemas work seamlessly in the evaluation workflow. The results display in the evaluation dashboard with all your custom fields visible.
+
+This makes it easy to analyze multiple dimensions of quality in a single evaluation run.
+
+## Example Use Cases
+
+**Binary Score with Reasoning:**
+Return a simple correct/incorrect judgment along with an explanation of why the output succeeded or failed.
+
+**Multi-dimensional Feedback:**
+Capture separate scores for accuracy, relevance, completeness, and tone in one evaluation. Include reasoning for each dimension.
+
+**Structured Classification:**
+Return categorical labels (excellent/good/fair/poor) along with specific issues found and suggestions for improvement.
+
+## Getting Started
+
+To use custom output schemas with LLM-as-a-Judge:
+
+1. Open the evaluator configuration
+2. Select your desired output type (binary, multiclass, or custom)
+3. Enable reasoning if you want explanations
+4. For advanced use, provide your JSON schema
+5. Run your evaluation
+
+Learn more in the [LLM-as-a-Judge documentation](/evaluation/configure-evaluators/llm-as-a-judge).
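To make the "Multi-dimensional Feedback" use case above concrete, a raw JSON schema for it might look like the sketch below (the field names and 1-5 ranges are illustrative assumptions, not part of the release):

```json
{
  "type": "object",
  "properties": {
    "accuracy": { "type": "integer", "minimum": 1, "maximum": 5 },
    "relevance": { "type": "integer", "minimum": 1, "maximum": 5 },
    "completeness": { "type": "integer", "minimum": 1, "maximum": 5 },
    "tone": { "type": "integer", "minimum": 1, "maximum": 5 },
    "reasoning": { "type": "string" }
  },
  "required": ["accuracy", "relevance", "completeness", "tone", "reasoning"]
}
```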

docs/blog/main.mdx

Lines changed: 27 additions & 0 deletions
@@ -10,6 +10,33 @@ import Image from "@theme/IdealImage";

 <section class="changelog">

+### [Customize LLM-as-a-Judge Output Schemas](/changelog/customize-llm-as-a-judge-output-schemas)
+
+_10 November 2025_
+
+**v0.62.0**
+
+<div style={{display: 'flex', justifyContent: 'center', gap: '24px', margin: '20px 0'}}>
+  <Image
+    img={require('/static/images/changelog/changelog-llm-as-a-judge-response-1.png')}
+    alt="Custom output schemas in LLM-as-a-Judge - Example 1"
+    style={{width: '48%', minWidth: 0}}
+  />
+  <Image
+    img={require('/static/images/changelog/changelog-llm-as-a-judge-response-2.png')}
+    alt="Custom output schemas in LLM-as-a-Judge - Example 2"
+    style={{width: '48%', minWidth: 0}}
+  />
+</div>
+
+The LLM-as-a-Judge evaluator now supports custom output schemas. Create multiple feedback outputs per evaluator with any structure you need.
+
+You can configure output types (binary, multiclass), include reasoning to improve prediction quality, or provide a raw JSON schema with any structure you define. Use these custom schemas in your evaluations to capture exactly the feedback you need.
+
+Learn more in the [LLM-as-a-Judge documentation](/evaluation/configure-evaluators/llm-as-a-judge).
+
+---
+
 ### [Documentation Overhaul](/changelog/documentation-architecture-overhaul)

 _3 November 2025_

docs/docs/evaluation/configure-evaluators/05-llm-as-a-judge.mdx

Lines changed: 27 additions & 1 deletion
@@ -2,6 +2,8 @@
 title: "LLM-as-a-Judge"
 ---

+import Image from "@theme/IdealImage";
+
 LLM-as-a-Judge is an evaluator that uses an LLM to assess LLM outputs. It's particularly useful for evaluating text generation tasks or chatbots where there's no single correct answer.

 ![Configuration of LLM-as-a-judge](/images/evaluation/configure-evaluators-3.png)
@@ -56,4 +58,28 @@ ANSWER ONLY THE SCORE. DO NOT USE MARKDOWN. DO NOT PROVIDE ANYTHING OTHER THAN T

 ### The Model

-The model can be configured to select one of the supported options (`gpt-3.5-turbo`, `gpt-4o`, `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `claude-3-5-sonnet`, `claude-3-5-haiku`, `claude-3-5-opus`). To use LLM-as-a-Judge, you'll need to set your OpenAI or Anthropic API key in the settings. The key is saved locally and only sent to our servers for evaluation—it's not stored there.
+The model can be configured to select one of the supported options (`gpt-4o`, `gpt-5`, `gpt-5-mini`, `gpt-5-nano`, `claude-3-5-sonnet`, `claude-3-5-haiku`, `claude-3-5-opus`). To use LLM-as-a-Judge, you'll need to set your OpenAI or Anthropic API key in the settings. The key is saved locally and only sent to our servers for evaluation; it's not stored there.
+
+### Output Schema
+
+You can configure the output schema to control what the LLM evaluator returns. This allows you to get structured feedback tailored to your evaluation needs.
+
+#### Basic Configuration
+
+The basic configuration lets you choose from common output types:
+
+- **Binary**: Returns a simple pass/fail or yes/no judgment
+- **Multiclass**: Returns a classification from a predefined set of categories
+- **Continuous**: Returns a score between a minimum and maximum value
+
+You can also enable **Include Reasoning** to have the evaluator explain its judgment. This option significantly improves the quality of evaluations by making the LLM's decision process transparent.
+
+<Image img={require('/static/images/changelog/changelog-llm-as-a-judge-response-1.png')} alt="Basic output schema configuration" style={{display: 'block', margin: '20px auto', textAlign: 'center'}} />
+
+
+#### Advanced Configuration
+
+For complete control, you can provide a custom JSON schema. This lets you define any output structure you need. For example, you could return multiple scores, confidence levels, detailed feedback categories, or any combination of fields.
+
+
+<Image img={require('/static/images/changelog/changelog-llm-as-a-judge-response-2.png')} alt="Advanced output schema configuration" style={{display: 'block', margin: '20px auto', textAlign: 'center'}} />
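As an illustration of the advanced configuration described above, a custom schema for a binary judgment with reasoning could look like this sketch (the field names are illustrative, not prescribed by the docs; placing `reasoning` before `score` encourages the model to explain before it grades):

```json
{
  "type": "object",
  "properties": {
    "reasoning": {
      "type": "string",
      "description": "Step-by-step explanation of the judgment"
    },
    "score": {
      "type": "boolean",
      "description": "Whether the output passes"
    }
  },
  "required": ["reasoning", "score"]
}
```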
Two new binary images added for the changelog (66.7 KB and 105 KB): changelog-llm-as-a-judge-response-1.png, changelog-llm-as-a-judge-response-2.png

sdk/agenta/sdk/workflows/handlers.py

Lines changed: 8 additions & 4 deletions
@@ -511,20 +511,24 @@ def field_match_test_v0(
     correct_answer = inputs[correct_answer_key]

     if not isinstance(outputs, str) and not isinstance(outputs, dict):
-        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+        # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+        return {"success": False}

     outputs_dict = outputs
     if isinstance(outputs, str):
         try:
             outputs_dict = loads(outputs)
         except json.JSONDecodeError as e:
-            raise InvalidOutputsV0Error(expected="dict", got=outputs) from e
+            # raise InvalidOutputsV0Error(expected="dict", got=outputs) from e
+            return {"success": False}

     if not isinstance(outputs_dict, dict):
-        raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+        # raise InvalidOutputsV0Error(expected=["dict", "str"], got=outputs)
+        return {"success": False}

     if not json_field in outputs_dict:
-        raise MissingOutputV0Error(path=json_field)
+        # raise MissingOutputV0Error(path=json_field)
+        return {"success": False}

     # --------------------------------------------------------------------------
     success = outputs_dict[json_field] == correct_answer
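The net effect of this hunk is that `field_match_test_v0` no longer raises on malformed evaluator outputs; every validation failure now degrades to a failed match. A minimal sketch of the resulting control flow (simplified signature; the real function takes more parameters and pulls `correct_answer` from `inputs`):

```python
import json


def field_match_sketch(outputs, json_field: str, correct_answer) -> dict:
    # Non-str/non-dict outputs used to raise InvalidOutputsV0Error.
    if not isinstance(outputs, (str, dict)):
        return {"success": False}
    # String outputs must parse as JSON; parse errors used to raise too.
    if isinstance(outputs, str):
        try:
            outputs = json.loads(outputs)
        except json.JSONDecodeError:
            return {"success": False}
    # The parsed value must be an object containing the target field
    # (previously InvalidOutputsV0Error / MissingOutputV0Error).
    if not isinstance(outputs, dict) or json_field not in outputs:
        return {"success": False}
    return {"success": outputs[json_field] == correct_answer}
```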

sdk/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "agenta"
-version = "0.62.0"
+version = "0.62.1"
 description = "The SDK for agenta is an open-source LLMOps platform."
 readme = "README.md"
 authors = [
