Commit 2765bd7

fix evaluation conversation handling (Azure#38043)
* fix evaluation conversation handling
* recordings
1 parent 1861870 · commit 2765bd7

2 files changed: +6 -7 lines changed


sdk/evaluation/azure-ai-evaluation/assets.json

Lines changed: 1 addition & 1 deletion
@@ -2,5 +2,5 @@
   "AssetsRepo": "Azure/azure-sdk-assets",
   "AssetsRepoPrefixPath": "python",
   "TagPrefix": "python/evaluation/azure-ai-evaluation",
-  "Tag": "python/evaluation/azure-ai-evaluation_73f2254a1c"
+  "Tag": "python/evaluation/azure-ai-evaluation_1390701e9d"
 }

sdk/evaluation/azure-ai-evaluation/azure/ai/evaluation/_evaluators/_common/_base_eval.py

Lines changed: 5 additions & 6 deletions
@@ -172,16 +172,16 @@ def converter(conversation: Dict) -> List[DerivedEvalInput]:
             response_context = response.get("context", None)
             if global_context:
                 context["global_context"] = global_context
-            if query_context and not include_query:
+            if query_context and include_query:
                 context["query_context"] = query_context
-            if response_context and not include_response:
+            if response_context and include_response:
                 context["response_context"] = response_context

             eval_input: DerivedEvalInput = {}
             if include_query:
-                eval_input["query"] = query
+                eval_input["query"] = query.get("content", "")
             if include_response:
-                eval_input["response"] = response
+                eval_input["response"] = response.get("content", "")
             if include_context:
                 eval_input["context"] = str(context)
             eval_inputs.append(eval_input)
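
The condition flips ("and not include_query" becoming "and include_query", likewise for the response) attach per-turn context only when the corresponding input is actually requested, and the .get("content", "") changes pass each message's text rather than the whole message dict. A minimal sketch of the fixed behavior for a single turn, assuming the {"messages": [...]} conversation shape these evaluators consume (the literal values are illustrative, not from the SDK):

# Minimal sketch, not the SDK's converter. Conversation shape assumed
# from the diff; values are illustrative.
conversation = {
    "context": "shared background",  # optional global context
    "messages": [
        {"role": "user", "content": "What is 2 + 2?", "context": "math quiz"},
        {"role": "assistant", "content": "4", "context": "arithmetic"},
    ],
}

include_query = include_response = include_context = True
query, response = conversation["messages"]

context = {}
if conversation.get("context"):
    context["global_context"] = conversation["context"]
# Fixed conditions: per-turn context rides along only for included inputs.
if query.get("context") and include_query:
    context["query_context"] = query["context"]
if response.get("context") and include_response:
    context["response_context"] = response["context"]

eval_input = {}
if include_query:
    eval_input["query"] = query.get("content", "")  # the text, not the whole dict
if include_response:
    eval_input["response"] = response.get("content", "")
if include_context:
    eval_input["context"] = str(context)

print(eval_input["query"], "->", eval_input["response"])  # What is 2 + 2? -> 4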
@@ -274,7 +274,6 @@ def _aggregate_results(self, per_turn_results: List[DoEvalResult[T_EvalValue]])
            aggregated[metric] = list_mean(cast(List[Union[int, float]], values))
        # Slap the per-turn results back in.
        aggregated["evaluation_per_turn"] = evaluation_per_turn
-
        return aggregated

    async def _real_call(self, **kwargs) -> Union[DoEvalResult[T_EvalValue], AggregateResult[T_EvalValue]]:
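
For orientation (this hunk only drops a blank line), the surrounding method averages each numeric metric across turns with list_mean and then attaches the raw per-turn values. A rough sketch of that aggregation pattern, with a plain mean standing in for list_mean:

from typing import Dict, List, Union

def aggregate(per_turn_results: List[Dict[str, Union[int, float]]]) -> Dict[str, object]:
    # Group each metric's values across turns.
    evaluation_per_turn: Dict[str, List[Union[int, float]]] = {}
    for result in per_turn_results:
        for metric, value in result.items():
            evaluation_per_turn.setdefault(metric, []).append(value)
    # Average per metric (plain mean standing in for list_mean).
    aggregated: Dict[str, object] = {
        metric: sum(values) / len(values)
        for metric, values in evaluation_per_turn.items()
    }
    # Slap the per-turn results back in.
    aggregated["evaluation_per_turn"] = evaluation_per_turn
    return aggregated

print(aggregate([{"coherence": 4}, {"coherence": 5}]))
# {'coherence': 4.5, 'evaluation_per_turn': {'coherence': [4, 5]}}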
@@ -315,7 +314,7 @@ def __init__(self, real_call): # DO NOT ADD TYPEHINT PROMPT FLOW WILL SCREAM AT

        # Don't look at my shame. Nothing to see here....
        # Oh, you're still here? Ok, the reason this has such a gross call signature and behavior is due
-        # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature#
+        # to our broken async code not properly handling inputs; keyword arguments that aren't in the signature
        # are just not passed into this function instead of ending up in kwargs.
        # Since we want this to be relatively call-agnostic, we just account for every input that any children
        # are known to throw at this, mash them into kwargs, and then pass them into the real call.
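
A hypothetical sketch of the workaround those comments describe: every argument a child evaluator is known to pass gets named explicitly in the signature (so the async plumbing forwards it), then everything is mashed back into kwargs for the real call. The class and parameter names below are illustrative, not the SDK's actual wrapper:

import asyncio

class AsyncEvaluatorWrapperSketch:
    # Illustrative only; mirrors the described pattern, not the SDK's class.

    def __init__(self, real_call):  # no typehint, per the original warning
        self._real_call = real_call

    async def __call__(self, *, query=None, response=None, conversation=None, **kwargs):
        # Mash the explicitly-named inputs back into kwargs so the wrapped
        # call stays signature-agnostic.
        named = (("query", query), ("response", response), ("conversation", conversation))
        for name, value in named:
            if value is not None:
                kwargs[name] = value
        return await self._real_call(**kwargs)

async def echo(**kwargs):
    return kwargs

print(asyncio.run(AsyncEvaluatorWrapperSketch(echo)(query="hi")))  # {'query': 'hi'}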
