Chore/support updated oai sdk (#43053)

nagkumar91 · Nagkumar Arkalgud · Nagkumar Arkalgud · slister1001 · commit a67562e82eca · 2025-09-18T17:27:28.000-04:00
* Prepare evals SDK Release

* Fix bug

* Fix for ADV_CONV for FDP projects

* Update release date

* re-add pyrit to matrix

* Change grader ids

* Update unit test

* replace all old grader IDs in tests

* Update platform-matrix.json

Add pyrit and not remove the other one

* Update test to ensure everything is mocked

* tox/black fixes

* Skip that test with issues

* update grader ID according to API View feedback

* Update test

* remove string check for grader ID

* Update changelog and officially start freeze

* update the enum according to suggestions

* update the changelog

* Finalize logic

* Initial plan

* Fix client request ID headers in azure-ai-evaluation

Co-authored-by: nagkumar91 <4727422+nagkumar91@users.noreply.github.com>

* Fix client request ID header format in rai_service.py

Co-authored-by: nagkumar91 <4727422+nagkumar91@users.noreply.github.com>

* Passing threshold in AzureOpenAIScoreModelGrader

* Add changelog

* Adding the self.pass_threshold instead of pass_threshold

* Add the python grader

* Remove redundant test

* Add class to exception list and format code

* Add properties to evaluation upload run for FDP

* Remove debug

* Remove the redundant property

* Fix changelog

* Fix the multiple features added section

* removed the properties in update

* Updates to support aoai 1.108.0 version with updated types for evals

* Black reformat

* Handle case when AOAI returns dict

---------

Co-authored-by: Nagkumar Arkalgud <nagkumar@naarkalg-work-mac.local>
Co-authored-by: Nagkumar Arkalgud <nagkumar@Mac.lan>
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: nagkumar91 <4727422+nagkumar91@users.noreply.github.com>
diff --git a/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py b/sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluate/_evaluate_aoai.py
@@ -272,8 +272,33 @@ def _get_single_run_results(
     for row_result in all_results:
         listed_results["index"].append(row_result.datasource_item_id)
         for single_grader_row_result in row_result.results:
-            grader_name = run_info["grader_name_map"][single_grader_row_result["name"]]
-            for name, value in single_grader_row_result.items():
+            if isinstance(single_grader_row_result, dict):
+                result_dict = single_grader_row_result
+            elif hasattr(single_grader_row_result, "model_dump"):
+                result_dict = single_grader_row_result.model_dump()
+            elif hasattr(single_grader_row_result, "dict"):
+                result_dict = single_grader_row_result.dict()
+            elif hasattr(single_grader_row_result, "__dict__"):
+                result_dict = vars(single_grader_row_result)
+            else:
+                raise EvaluationException(
+                    message=("Unsupported AOAI evaluation result type: " f"{type(single_grader_row_result)!r}."),
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_result_name = result_dict.get("name", None)
+            if grader_result_name is None:
+                raise EvaluationException(
+                    message="AOAI evaluation response missing grader result name; unable to map to original grader.",
+                    blame=ErrorBlame.UNKNOWN,
+                    category=ErrorCategory.FAILED_EXECUTION,
+                    target=ErrorTarget.AOAI_GRADER,
+                )
+
+            grader_name = run_info["grader_name_map"][grader_result_name]
+            for name, value in result_dict.items():
                 if name in ["name"]:
                     continue
                 if name.lower() == "passed":
diff --git a/sdk/evaluation/azure-ai-evaluation/setup.py b/sdk/evaluation/azure-ai-evaluation/setup.py
@@ -75,7 +75,7 @@
         "httpx>=0.25.1",
         # Dependencies added since Promptflow will soon be made optional
         "pandas>=2.1.2,<3.0.0",
-        "openai>=1.78.0",
+        "openai>=1.108.0",
         "ruamel.yaml>=0.17.10,<1.0.0",
         "msrest>=0.6.21",
         "Jinja2>=3.1.6",
diff --git a/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py b/sdk/evaluation/azure-ai-evaluation/tests/unittests/test_aoai_score_model_grader.py
@@ -20,6 +20,22 @@
 )
 
 
+def _sampling_params_as_dict(value):
+    """Normalize sampling params to a plain dictionary for assertions."""
+
+    if value is None:
+        return {}
+    if isinstance(value, dict):
+        return value
+    if hasattr(value, "model_dump"):
+        return value.model_dump(exclude_none=True)
+    if hasattr(value, "dict"):
+        return value.dict(exclude_none=True)
+    if hasattr(value, "__dict__"):
+        return {k: v for k, v in vars(value).items() if v is not None and not k.startswith("_")}
+    return value
+
+
 @pytest.fixture
 def mock_aoai_model_config():
     """Mock Azure OpenAI model configuration for testing."""
@@ -200,8 +216,9 @@ def test_grader_config_properties(self, mock_aoai_model_config, basic_score_grad
         assert config.input[0].role == "system"
         assert config.input[1].role == "user"
         assert config.range == [0.0, 1.0]
-        assert config.sampling_params["temperature"] == 0.0
-        assert config.sampling_params["max_tokens"] == 100
+        sampling_params = _sampling_params_as_dict(config.sampling_params)
+        assert sampling_params["temperature"] == 0.0
+        assert sampling_params["max_tokens"] == 100
         assert grader.pass_threshold == 0.5
 
     def test_different_score_ranges(self, mock_aoai_model_config):
@@ -534,7 +551,7 @@ def test_grader_with_complex_sampling_params(self, mock_aoai_model_config):
             sampling_params=complex_params,
         )
 
-        assert grader._grader_config.sampling_params == complex_params
+        assert _sampling_params_as_dict(grader._grader_config.sampling_params) == complex_params
 
     def test_grader_with_unicode_content(self, mock_aoai_model_config):
         """Test grader with Unicode and special characters in content."""