
Commit bb37432

Nagkumar Arkalgud (nagkumar91) authored
AzureOpenAI model grader support in evals (#41599)

* Prepare evals SDK release
* Fix bug
* Fix for ADV_CONV for FDP projects
* Update release date
* feat: Add AzureOpenAIScoreModelGrader for continuous scoring evaluation
  - Implement AzureOpenAIScoreModelGrader in _aoai/score_model_grader.py
  - Update module exports in _aoai/__init__.py and __init__.py
  - Register the grader in the _evaluate/_evaluate_aoai.py grader registry
  - Add a comprehensive sample script with real-credential support
  - Include integration plan documentation
  - Support conversation-style input, score ranges, and sampling parameters
  - Handle template variables using {{ item.field }} syntax
  - Provide a fallback demo mode for configuration testing
* Add tests
* Remove the plan md
* Add evaluator to exceptions for save eval e2e test
* Update changelog and sample

Co-authored-by: Nagkumar Arkalgud <[email protected]>
Co-authored-by: Nagkumar Arkalgud <[email protected]>
1 parent a352284 commit bb37432

File tree

9 files changed: +1317 −13 lines changed


sdk/evaluation/azure-ai-evaluation/CHANGELOG.md

Lines changed: 3 additions & 0 deletions
@@ -4,6 +4,9 @@

 ### Features Added

+- Added support for Azure OpenAI evaluation via the `AzureOpenAIScoreModelGrader` class, which serves as a wrapper around Azure OpenAI score model configurations. This new grader object can be supplied to the main `evaluate` method as if it were a normal callable evaluator.
+
 ### Bugs Fixed

 - Significant improvements to the IntentResolution evaluator. The new version has less variance, is nearly 2x faster, and consumes fewer tokens.
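
The entry above describes the grader as a drop-in callable evaluator. A minimal sketch of that usage, with placeholder endpoint, key, deployment, and data-file values (none of these values come from the commit; the grader arguments mirror the class signature added below):

from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    AzureOpenAIScoreModelGrader,
    evaluate,
)

# Placeholder resource details -- substitute your own.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<resource>.openai.azure.com",
    api_key="<api-key>",
    azure_deployment="<deployment>",
)

grader = AzureOpenAIScoreModelGrader(
    model_config=model_config,
    name="response_quality",
    model="gpt-4o-mini",
    input=[
        {"role": "system", "content": "Score the answer from 0.0 (poor) to 1.0 (excellent)."},
        {"role": "user", "content": "{{ item.response }}"},
    ],
    range=[0.0, 1.0],
)

# The grader is supplied exactly like a normal callable evaluator.
result = evaluate(data="data.jsonl", evaluators={"response_quality": grader})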

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -45,6 +45,7 @@
 from ._aoai.label_grader import AzureOpenAILabelGrader
 from ._aoai.string_check_grader import AzureOpenAIStringCheckGrader
 from ._aoai.text_similarity_grader import AzureOpenAITextSimilarityGrader
+from ._aoai.score_model_grader import AzureOpenAIScoreModelGrader


 _patch_all = []
@@ -102,6 +103,7 @@
     "AzureOpenAILabelGrader",
     "AzureOpenAIStringCheckGrader",
     "AzureOpenAITextSimilarityGrader",
+    "AzureOpenAIScoreModelGrader",
 ]

 __all__.extend([p for p in _patch_all if p not in __all__])
sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_aoai/score_model_grader.py

Lines changed: 90 additions & 0 deletions

@@ -0,0 +1,90 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------
from typing import Any, Dict, Union, List, Optional

from azure.ai.evaluation._model_configurations import AzureOpenAIModelConfiguration, OpenAIModelConfiguration
from openai.types.graders import ScoreModelGrader
from azure.ai.evaluation._common._experimental import experimental

from .aoai_grader import AzureOpenAIGrader


@experimental
class AzureOpenAIScoreModelGrader(AzureOpenAIGrader):
    """
    Wrapper class for OpenAI's score model graders.

    Enables continuous scoring evaluation with custom prompts and flexible
    conversation-style inputs. Supports configurable score ranges and
    pass thresholds for binary classification.

    Supplying a ScoreModelGrader to the `evaluate` method will cause an
    asynchronous request to evaluate the grader via the OpenAI API. The
    results of the evaluation will then be merged into the standard
    evaluation results.

    :param model_config: The model configuration to use for the grader.
    :type model_config: Union[
        ~azure.ai.evaluation.AzureOpenAIModelConfiguration,
        ~azure.ai.evaluation.OpenAIModelConfiguration
    ]
    :param input: The input messages for the grader. List of conversation
        messages with role and content.
    :type input: List[Dict[str, str]]
    :param model: The model to use for the evaluation.
    :type model: str
    :param name: The name of the grader.
    :type name: str
    :param range: The range of the score. Defaults to [0, 1].
    :type range: Optional[List[float]]
    :param pass_threshold: Score threshold for pass/fail classification.
        Defaults to midpoint of range.
    :type pass_threshold: Optional[float]
    :param sampling_params: The sampling parameters for the model.
    :type sampling_params: Optional[Dict[str, Any]]
    :param kwargs: Additional keyword arguments to pass to the grader.
    :type kwargs: Any
    """

    id = "aoai://score_model"

    def __init__(
        self,
        *,
        model_config: Union[AzureOpenAIModelConfiguration, OpenAIModelConfiguration],
        input: List[Dict[str, str]],
        model: str,
        name: str,
        range: Optional[List[float]] = None,
        pass_threshold: Optional[float] = None,
        sampling_params: Optional[Dict[str, Any]] = None,
        **kwargs: Any,
    ):
        # Validate range and pass_threshold
        if range is not None:
            if len(range) != 2 or range[0] >= range[1]:
                raise ValueError("range must be a list of two numbers [min, max] where min < max")
        else:
            range = [0.0, 1.0]  # Default range

        if pass_threshold is not None:
            if range and (pass_threshold < range[0] or pass_threshold > range[1]):
                raise ValueError(f"pass_threshold {pass_threshold} must be within range {range}")
        else:
            pass_threshold = (range[0] + range[1]) / 2  # Default to midpoint

        # Store pass_threshold as instance attribute
        self.pass_threshold = pass_threshold

        # Create OpenAI ScoreModelGrader instance
        grader_kwargs = {"input": input, "model": model, "name": name, "type": "score_model"}

        if range is not None:
            grader_kwargs["range"] = range
        if sampling_params is not None:
            grader_kwargs["sampling_params"] = sampling_params

        grader = ScoreModelGrader(**grader_kwargs)

        super().__init__(model_config=model_config, grader_config=grader, **kwargs)
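
For reference, the constructor's defaulting and validation logic above implies the behavior sketched here; the values and the `model_config` binding are illustrative, not part of the commit:

# Assuming `model_config` was built elsewhere (e.g., AzureOpenAIModelConfiguration(...)).
# Omitting `pass_threshold` defaults it to the midpoint of `range`;
# omitting `range` defaults it to [0.0, 1.0].
grader = AzureOpenAIScoreModelGrader(
    model_config=model_config,
    name="relevance",
    model="gpt-4o-mini",
    input=[{"role": "user", "content": "{{ item.query }}"}],
    range=[1.0, 5.0],  # illustrative five-point scale
)
assert grader.pass_threshold == 3.0  # midpoint of [1.0, 5.0]

# An inconsistent threshold fails fast: pass_threshold=6.0 with range=[1.0, 5.0]
# raises ValueError("pass_threshold 6.0 must be within range [1.0, 5.0]").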

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py

Lines changed: 2 additions & 0 deletions
@@ -317,13 +317,15 @@ def _get_grader_class(model_id: str) -> Type[AzureOpenAIGrader]:
         AzureOpenAILabelGrader,
         AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader,
+        AzureOpenAIScoreModelGrader,
     )

     id_map = {
         AzureOpenAIGrader.id: AzureOpenAIGrader,
         AzureOpenAILabelGrader.id: AzureOpenAILabelGrader,
         AzureOpenAIStringCheckGrader.id: AzureOpenAIStringCheckGrader,
         AzureOpenAITextSimilarityGrader.id: AzureOpenAITextSimilarityGrader,
+        AzureOpenAIScoreModelGrader.id: AzureOpenAIScoreModelGrader,
     }

     for key in id_map.keys():
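
The lookup added here keys off the grader's class-level `id` string. A small check of that mapping, using only the public export (the internal `_get_grader_class` helper resolves graders against the same key):

from azure.ai.evaluation import AzureOpenAIScoreModelGrader

# The registry key registered above is the grader's `id` class attribute.
assert AzureOpenAIScoreModelGrader.id == "aoai://score_model"
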
Lines changed: 257 additions & 0 deletions
@@ -0,0 +1,257 @@
# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""
Sample demonstrating the use of AzureOpenAIScoreModelGrader for continuous
scoring evaluation.

This sample shows how to:
1. Configure an Azure OpenAI model for grading
2. Create a score model grader with custom prompts
3. Run evaluation using the evaluate() method
4. Interpret continuous scoring results

Prerequisites:
- Azure OpenAI resource with API key and endpoint
- Model deployment (e.g., gpt-4, gpt-4o-mini)
- Sample conversation data in JSONL format
- Environment variables configured in .env file
"""

import json
import os
from dotenv import load_dotenv
import pandas as pd
from azure.ai.evaluation import evaluate, AzureOpenAIScoreModelGrader
from azure.ai.evaluation import AzureOpenAIModelConfiguration

# Load environment variables
load_dotenv()


def create_sample_data() -> str:
    """Create sample conversation data for testing."""
    sample_conversations = [
        {
            "conversation": {
                "messages": [
                    {"content": "How can I improve my Python coding skills?", "role": "user"},
                    {
                        "content": (
                            "Here are some effective ways to improve your "
                            "Python skills: 1) Practice coding daily with "
                            "platforms like LeetCode or HackerRank, 2) Work "
                            "on real projects that interest you, 3) Read "
                            "other people's code on GitHub, 4) Join Python "
                            "communities and forums, 5) Take online courses "
                            "or tutorials. The key is consistent practice and "
                            "building projects that challenge you."
                        ),
                        "role": "assistant",
                    },
                ]
            },
            "expected_quality": "high",
            "context": "Programming advice request",
        },
        {
            "conversation": {
                "messages": [
                    {"content": "What is Python?", "role": "user"},
                    {"content": "Python is a programming language.", "role": "assistant"},
                ]
            },
            "expected_quality": "low",
            "context": "Basic definition request",
        },
        {
            "conversation": {
                "messages": [
                    {"content": "Can you explain machine learning concepts for a beginner?", "role": "user"},
                    {
                        "content": (
                            "Machine learning is a subset of artificial "
                            "intelligence where computers learn patterns from "
                            "data without being explicitly programmed. Think "
                            "of it like teaching a child to recognize animals "
                            "- you show them many pictures of cats and dogs, "
                            "and eventually they learn to distinguish between "
                            "them. In ML, we feed algorithms lots of data so "
                            "they can make predictions or decisions on new, "
                            "unseen data. Common types include supervised "
                            "learning (learning from labeled examples), "
                            "unsupervised learning (finding hidden patterns), "
                            "and reinforcement learning (learning through "
                            "trial and error)."
                        ),
                        "role": "assistant",
                    },
                ]
            },
            "expected_quality": "high",
            "context": "Educational explanation request",
        },
        {
            "conversation": {
                "messages": [
                    {"content": "Help me debug this code", "role": "user"},
                    {"content": "Sure, please share your code.", "role": "assistant"},
                ]
            },
            "expected_quality": "medium",
            "context": "Debug assistance request",
        },
        {
            "conversation": {
                "messages": [
                    {"content": "What are the best practices for writing clean Python code?", "role": "user"},
                    {
                        "content": (
                            "Here are key best practices for writing clean "
                            "Python code: 1) Follow PEP 8 style guidelines "
                            "for consistent formatting, 2) Use descriptive "
                            "variable and function names that explain their "
                            "purpose, 3) Write docstrings for functions and "
                            "classes, 4) Keep functions small and focused on "
                            "a single task, 5) Use type hints to improve code "
                            "clarity, 6) Handle exceptions appropriately, "
                            "7) Write unit tests for your code, 8) Use "
                            "virtual environments for dependency management, "
                            "9) Comment complex logic but avoid obvious "
                            "comments, 10) Refactor code regularly to improve "
                            "readability and maintainability."
                        ),
                        "role": "assistant",
                    },
                ]
            },
            "expected_quality": "high",
            "context": "Best practices inquiry",
        },
    ]

    # Create JSONL file
    filename = "sample_conversations.jsonl"
    with open(filename, "w") as f:
        for conv in sample_conversations:
            f.write(json.dumps(conv) + "\n")

    print(f"Created sample data file: {filename}")
    return filename


def demonstrate_score_model_grader():
    """Demonstrate the AzureOpenAIScoreModelGrader usage with real credentials."""

    # Create sample data
    data_file = create_sample_data()

    print("=== Azure OpenAI Score Model Grader Demo ===\n")

    try:
        # 1. Configure Azure OpenAI model using environment variables
        model_config = AzureOpenAIModelConfiguration(
            azure_endpoint=os.environ.get("endpoint"),
            api_key=os.environ.get("key"),
            azure_deployment=os.environ.get("deployment_name"),
            api_version="2024-12-01-preview",
        )

        print("✅ Model configuration loaded successfully")

        # 2. Create conversation quality grader
        conversation_quality_grader = AzureOpenAIScoreModelGrader(
            model_config=model_config,
            name="Conversation Quality Assessment",
            model="gpt-4o-mini",
            input=[
                {
                    "role": "system",
                    "content": (
                        "You are an expert conversation quality evaluator. "
                        "Assess the quality of AI assistant responses based on "
                        "helpfulness, completeness, accuracy, and "
                        "appropriateness. Return a score between 0.0 (very "
                        "poor) and 1.0 (excellent)."
                    ),
                },
                {
                    "role": "user",
                    "content": (
                        "Evaluate this conversation:\n"
                        "Context: {{ item.context }}\n"
                        "Messages: {{ item.conversation }}\n\n"
                        "Provide a quality score from 0.0 to 1.0."
                    ),
                },
            ],
            range=[0.0, 1.0],
            sampling_params={"temperature": 0.0},
        )

        print("✅ Conversation quality grader created successfully")

        # 3. Run evaluation with the score model grader
        print("\n🚀 Running evaluation with score model grader...")

        result = evaluate(
            data=data_file,
            evaluators={"conversation_quality": conversation_quality_grader},
            azure_ai_project=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
        )

        # 4. Display results
        print("\n=== Evaluation Results ===")
        print(f"Total samples evaluated: {len(result['rows'])}")

        # Show metrics
        print("\n=== Metrics Summary ===")
        for metric_name, metric_value in result["metrics"].items():
            print(f"{metric_name}: {metric_value:.3f}")

        # Show detailed results
        print("\n=== Sample Results ===")
        df = pd.DataFrame(result["rows"])

        for i, row in df.head(3).iterrows():
            print(f"\nSample {i+1}:")
            print(f"  Context: {row.get('context', 'N/A')}")

            # Show grader results
            for col in df.columns:
                if col.startswith("outputs."):
                    grader_name = col.split(".")[1]
                    if "score" in col:
                        print(f"  {grader_name} Score: {row[col]:.3f}")
                    elif "passed" in col:
                        print(f"  {grader_name} Passed: {row[col]}")

        print("\n✅ Evaluation completed successfully!")

    except Exception as e:
        print(f"\n❌ Error during evaluation: {str(e)}")

    # Clean up
    if os.path.exists(data_file):
        os.remove(data_file)
        print(f"\n🧹 Cleaned up temporary file: {data_file}")


if __name__ == "__main__":
    print("🚀 Starting Azure OpenAI Score Model Grader Demo\n")

    # Check if environment variables are set
    required_vars = ["endpoint", "key", "deployment_name"]

    missing_vars = [var for var in required_vars if not os.environ.get(var)]

    if missing_vars:
        print("⚠️ Missing environment variables:")
        for var in missing_vars:
            print(f"  - {var}")
    else:
        print("✅ All environment variables found")
        demonstrate_score_model_grader()

    print("\n🎉 Demo completed!")
Lines changed: 3 additions & 0 deletions
@@ -0,0 +1,3 @@
{"conversation": {"messages": [{"role": "user", "content": "How can I improve my Python coding skills?"}, {"role": "assistant", "content": "Here are some effective ways to improve your Python skills: 1) Practice coding daily with platforms like LeetCode or HackerRank, 2) Work on real projects that interest you, 3) Read other people's code on GitHub, 4) Join Python communities and forums, 5) Take online courses or tutorials."}]}, "context": "Programming advice", "expected_quality": "high"}
{"conversation": {"messages": [{"role": "user", "content": "What is Python?"}, {"role": "assistant", "content": "Python is a programming language."}]}, "context": "Basic definition", "expected_quality": "low"}
{"conversation": {"messages": [{"role": "user", "content": "Can you explain machine learning for a beginner?"}, {"role": "assistant", "content": "Machine learning is a subset of artificial intelligence where computers learn patterns from data without being explicitly programmed. Think of it like teaching a child to recognize animals - you show them many pictures of cats and dogs, and eventually they learn to distinguish between them."}]}, "context": "Educational explanation", "expected_quality": "high"}
