# ---------------------------------------------------------
# Copyright (c) Microsoft Corporation. All rights reserved.
# ---------------------------------------------------------

"""
Sample demonstrating the use of AzureOpenAIScoreModelGrader for continuous
scoring evaluation.

This sample shows how to:
1. Configure an Azure OpenAI model for grading
2. Create a score model grader with custom prompts
3. Run evaluation using the evaluate() method
4. Interpret continuous scoring results

Prerequisites:
- Azure OpenAI resource with API key and endpoint
- Model deployment (e.g., gpt-4, gpt-4o-mini)
- Sample conversation data in JSONL format
- Environment variables configured in .env file
"""

import json
import os

import pandas as pd
from dotenv import load_dotenv

from azure.ai.evaluation import (
    AzureOpenAIModelConfiguration,
    AzureOpenAIScoreModelGrader,
    evaluate,
)

# Pull credentials from a local .env file into os.environ.
load_dotenv()
def create_sample_data() -> str:
    """Create sample conversation data for testing.

    Writes five sample records — each a short user/assistant conversation
    plus ``expected_quality``/``context`` metadata — to a JSONL file in the
    current working directory.

    :return: Name of the JSONL file that was written.
    """
    sample_conversations = [
        {
            "conversation": {
                "messages": [
                    {"content": "How can I improve my Python coding skills?", "role": "user"},
                    {
                        "content": (
                            "Here are some effective ways to improve your "
                            "Python skills: 1) Practice coding daily with "
                            "platforms like LeetCode or HackerRank, 2) Work "
                            "on real projects that interest you, 3) Read "
                            "other people's code on GitHub, 4) Join Python "
                            "communities and forums, 5) Take online courses "
                            "or tutorials. The key is consistent practice and "
                            "building projects that challenge you."
                        ),
                        "role": "assistant",
                    },
                ]
            },
            "expected_quality": "high",
            "context": "Programming advice request",
        },
        {
            "conversation": {
                "messages": [
                    {"content": "What is Python?", "role": "user"},
                    {"content": "Python is a programming language.", "role": "assistant"},
                ]
            },
            "expected_quality": "low",
            "context": "Basic definition request",
        },
        {
            "conversation": {
                "messages": [
                    {"content": ("Can you explain machine learning concepts " "for a beginner?"), "role": "user"},
                    {
                        "content": (
                            "Machine learning is a subset of artificial "
                            "intelligence where computers learn patterns from "
                            "data without being explicitly programmed. Think "
                            "of it like teaching a child to recognize animals "
                            "- you show them many pictures of cats and dogs, "
                            "and eventually they learn to distinguish between "
                            "them. In ML, we feed algorithms lots of data so "
                            "they can make predictions or decisions on new, "
                            "unseen data. Common types include supervised "
                            "learning (learning from labeled examples), "
                            "unsupervised learning (finding hidden patterns), "
                            "and reinforcement learning (learning through "
                            "trial and error)."
                        ),
                        "role": "assistant",
                    },
                ]
            },
            "expected_quality": "high",
            "context": "Educational explanation request",
        },
        {
            "conversation": {
                "messages": [
                    {"content": "Help me debug this code", "role": "user"},
                    {"content": "Sure, please share your code.", "role": "assistant"},
                ]
            },
            "expected_quality": "medium",
            "context": "Debug assistance request",
        },
        {
            "conversation": {
                "messages": [
                    {"content": ("What are the best practices for writing " "clean Python code?"), "role": "user"},
                    {
                        "content": (
                            "Here are key best practices for writing clean "
                            "Python code: 1) Follow PEP 8 style guidelines "
                            "for consistent formatting, 2) Use descriptive "
                            "variable and function names that explain their "
                            "purpose, 3) Write docstrings for functions and "
                            "classes, 4) Keep functions small and focused on "
                            "a single task, 5) Use type hints to improve code "
                            "clarity, 6) Handle exceptions appropriately, "
                            "7) Write unit tests for your code, 8) Use "
                            "virtual environments for dependency management, "
                            "9) Comment complex logic but avoid obvious "
                            "comments, 10) Refactor code regularly to improve "
                            "readability and maintainability."
                        ),
                        "role": "assistant",
                    },
                ]
            },
            "expected_quality": "high",
            "context": "Best practices inquiry",
        },
    ]

    # Write one JSON object per line — the JSONL format evaluate() expects.
    # Explicit UTF-8 so the output does not depend on the platform encoding.
    filename = "sample_conversations.jsonl"
    with open(filename, "w", encoding="utf-8") as f:
        for conv in sample_conversations:
            f.write(json.dumps(conv) + "\n")

    # Bug fix: the original f-string contained no placeholder, so the
    # filename was never actually printed.
    print(f"Created sample data file: {filename}")
    return filename
| 143 | + |
def demonstrate_score_model_grader():
    """Demonstrate the AzureOpenAIScoreModelGrader usage with real credentials."""

    # Generate the sample JSONL file used as the evaluation input.
    data_file = create_sample_data()

    print("=== Azure OpenAI Score Model Grader Demo ===\n")

    try:
        # 1. Build the Azure OpenAI model configuration from the environment.
        config = AzureOpenAIModelConfiguration(
            azure_endpoint=os.environ.get("endpoint"),
            api_key=os.environ.get("key"),
            azure_deployment=os.environ.get("deployment_name"),
            api_version="2024-12-01-preview",
        )
        print("✅ Model configuration loaded successfully")

        # 2. Define the grader: a system prompt stating the rubric and a
        #    user prompt templated over each data row via {{ item.* }}.
        system_prompt = (
            "You are an expert conversation quality evaluator. "
            "Assess the quality of AI assistant responses based on "
            "helpfulness, completeness, accuracy, and "
            "appropriateness. Return a score between 0.0 (very "
            "poor) and 1.0 (excellent)."
        )
        user_prompt = (
            "Evaluate this conversation:\n"
            "Context: {{ item.context }}\n"
            "Messages: {{ item.conversation }}\n\n"
            "Provide a quality score from 0.0 to 1.0."
        )
        quality_grader = AzureOpenAIScoreModelGrader(
            model_config=config,
            name="Conversation Quality Assessment",
            model="gpt-4o-mini",
            input=[
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_prompt},
            ],
            range=[0.0, 1.0],
            # Deterministic sampling for reproducible grading.
            sampling_params={"temperature": 0.0},
        )
        print("✅ Conversation quality grader created successfully")

        # 3. Run the evaluation with the score model grader.
        print("\n🚀 Running evaluation with score model grader...")
        result = evaluate(
            data=data_file,
            evaluators={"conversation_quality": quality_grader},
            azure_ai_project=os.environ.get("AZURE_AI_PROJECT_ENDPOINT"),
        )

        # 4. Summarize the evaluation output.
        print("\n=== Evaluation Results ===")
        print(f"Total samples evaluated: {len(result['rows'])}")

        print("\n=== Metrics Summary ===")
        for name, value in result["metrics"].items():
            print(f"{name}: {value:.3f}")

        print("\n=== Sample Results ===")
        frame = pd.DataFrame(result["rows"])
        # Grader outputs land in "outputs.<grader_name>.*" columns.
        grader_columns = [c for c in frame.columns if c.startswith("outputs.")]

        for idx, record in frame.head(3).iterrows():
            print(f"\nSample {idx + 1}:")
            print(f"  Context: {record.get('context', 'N/A')}")
            for column in grader_columns:
                grader = column.split(".")[1]
                if "score" in column:
                    print(f"  {grader} Score: {record[column]:.3f}")
                elif "passed" in column:
                    print(f"  {grader} Passed: {record[column]}")

        print("\n✅ Evaluation completed successfully!")

    except Exception as e:
        print(f"\n❌ Error during evaluation: {str(e)}")

    # Remove the temporary data file whether or not evaluation succeeded.
    if os.path.exists(data_file):
        os.remove(data_file)
        print(f"\n🧹 Cleaned up temporary file: {data_file}")
| 239 | + |
| 240 | + |
if __name__ == "__main__":
    print("🚀 Starting Azure OpenAI Score Model Grader Demo\n")

    # The demo reads its credentials from these (lowercase) .env keys.
    required_vars = ["endpoint", "key", "deployment_name"]
    missing_vars = [name for name in required_vars if not os.environ.get(name)]

    if not missing_vars:
        print("✅ All environment variables found")
        demonstrate_score_model_grader()
    else:
        print("⚠️ Missing environment variables:")
        for name in missing_vars:
            print(f"  - {name}")

    print("\n🎉 Demo completed!")