Add ask evaluation, update snapshots

pamelafox · pamelafox · commit 0ee4f213e7c3 · 2025-09-04T11:45:02.000-07:00
diff --git a/evals/evaluate_config.json b/evals/evaluate_config.json
@@ -1,8 +1,8 @@
 {
     "testdata_path": "ground_truth.jsonl",
-    "results_dir": "results/experiment<TIMESTAMP>",
+    "results_dir": "results/baseline-ask",
     "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
-    "target_url": "http://localhost:50505/chat",
+    "target_url": "http://localhost:50505/ask",
     "target_parameters": {
         "overrides": {
             "top": 3,
diff --git a/evals/results/baseline-ask/config.json b/evals/results/baseline-ask/config.json
@@ -0,0 +1,33 @@
+{
+    "testdata_path": "ground_truth.jsonl",
+    "results_dir": "results/baseline-ask",
+    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
+    "target_url": "http://localhost:50505/ask",
+    "target_parameters": {
+        "overrides": {
+            "top": 3,
+            "max_subqueries": 10,
+            "results_merge_strategy": "interleaved",
+            "temperature": 0.3,
+            "minimum_reranker_score": 0,
+            "minimum_search_score": 0,
+            "retrieval_mode": "hybrid",
+            "semantic_ranker": true,
+            "semantic_captions": false,
+            "query_rewriting": false,
+            "reasoning_effort": "minimal",
+            "suggest_followup_questions": false,
+            "use_oid_security_filter": false,
+            "use_groups_security_filter": false,
+            "search_text_embeddings": true,
+            "search_image_embeddings": true,
+            "send_text_sources": true,
+            "send_image_sources": true,
+            "language": "en",
+            "use_agentic_retrieval": false,
+            "seed": 1
+        }
+    },
+    "target_response_answer_jmespath": "message.content",
+    "target_response_context_jmespath": "context.data_points.text"
+}
diff --git a/evals/results/baseline-ask/eval_results.jsonl b/evals/results/baseline-ask/eval_results.jsonl
diff --git a/evals/results/baseline-ask/evaluate_parameters.json b/evals/results/baseline-ask/evaluate_parameters.json
@@ -0,0 +1,32 @@
+{
+    "evaluation_gpt_model": "gpt-4o",
+    "evaluation_timestamp": 1757011333,
+    "testdata_path": "/Users/pamelafox/azure-search-openai-demo/evals/ground_truth.jsonl",
+    "target_url": "http://localhost:50505/ask",
+    "target_parameters": {
+        "overrides": {
+            "top": 3,
+            "max_subqueries": 10,
+            "results_merge_strategy": "interleaved",
+            "temperature": 0.3,
+            "minimum_reranker_score": 0,
+            "minimum_search_score": 0,
+            "retrieval_mode": "hybrid",
+            "semantic_ranker": true,
+            "semantic_captions": false,
+            "query_rewriting": false,
+            "reasoning_effort": "minimal",
+            "suggest_followup_questions": false,
+            "use_oid_security_filter": false,
+            "use_groups_security_filter": false,
+            "search_text_embeddings": true,
+            "search_image_embeddings": true,
+            "send_text_sources": true,
+            "send_image_sources": true,
+            "language": "en",
+            "use_agentic_retrieval": false,
+            "seed": 1
+        }
+    },
+    "num_questions": null
+}
diff --git a/evals/results/baseline-ask/summary.json b/evals/results/baseline-ask/summary.json
@@ -0,0 +1,33 @@
+{
+    "gpt_groundedness": {
+        "pass_count": 49,
+        "pass_rate": 0.98,
+        "mean_rating": 4.88
+    },
+    "gpt_relevance": {
+        "pass_count": 46,
+        "pass_rate": 0.92,
+        "mean_rating": 4.32
+    },
+    "answer_length": {
+        "mean": 758.08,
+        "max": 1403,
+        "min": 193
+    },
+    "latency": {
+        "mean": 3.62,
+        "max": 6.500667,
+        "min": -1.0
+    },
+    "citations_matched": {
+        "total": 22,
+        "rate": 0.44
+    },
+    "any_citation": {
+        "total": 49,
+        "rate": 0.98
+    },
+    "num_questions": {
+        "total": 50
+    }
+}
diff --git a/tests/snapshots/test_app/test_ask_prompt_template_concat/client0/result.json b/tests/snapshots/test_app/test_ask_prompt_template_concat/client0/result.json
@@ -54,7 +54,7 @@
             {
                 "description": [
                     {
-                        "content": "You are an intelligent assistant helping Contoso Inc employees with their healthcare plan questions and employee handbook questions.\nUse 'you' to refer to the individual asking the questions even if they ask with 'I'.\nAnswer the following question using only the data provided in the sources below.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\nIf you cannot answer using the sources below, say you don't know. Use below example to answer.\n\nPossible citations for current question:\n\n[Benefit_Options-2.pdf]\n\n Meow like a cat.",
+                        "content": "Assistant helps the company employees with their questions about internal documents. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.\nYou CANNOT ask clarifying questions to the user, since the user will have no way to reply.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\n\nPossible citations for current question:  [Benefit_Options-2.pdf] \n Meow like a cat.",
                         "role": "system"
                     },
                     {
diff --git a/tests/snapshots/test_app/test_ask_prompt_template_concat/client1/result.json b/tests/snapshots/test_app/test_ask_prompt_template_concat/client1/result.json
@@ -54,7 +54,7 @@
             {
                 "description": [
                     {
-                        "content": "You are an intelligent assistant helping Contoso Inc employees with their healthcare plan questions and employee handbook questions.\nUse 'you' to refer to the individual asking the questions even if they ask with 'I'.\nAnswer the following question using only the data provided in the sources below.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\nIf you cannot answer using the sources below, say you don't know. Use below example to answer.\n\nPossible citations for current question:\n\n[Benefit_Options-2.pdf]\n\n Meow like a cat.",
+                        "content": "Assistant helps the company employees with their questions about internal documents. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.\nYou CANNOT ask clarifying questions to the user, since the user will have no way to reply.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\n\nPossible citations for current question:  [Benefit_Options-2.pdf] \n Meow like a cat.",
                         "role": "system"
                     },
                     {
diff --git a/tests/test_app.py b/tests/test_app.py
@@ -702,7 +702,9 @@ async def test_ask_prompt_template_concat(client, snapshot):
     )
     assert response.status_code == 200
     result = await response.get_json()
-    assert result["context"]["thoughts"][2]["description"][0]["content"].startswith("You are an intelligent assistant")
+    assert result["context"]["thoughts"][2]["description"][0]["content"].startswith(
+        "Assistant helps the company employees"
+    )
     assert result["context"]["thoughts"][2]["description"][0]["content"].endswith("Meow like a cat.")
     snapshot.assert_match(json.dumps(result, indent=4), "result.json")
 

Original file line number	Diff line number	Diff line change
`@@ -54,7 +54,7 @@`
`54`	`54`	`{`
`55`	`55`	`"description": [`
`56`	`56`	`{`
`57`		- "content": "You are an intelligent assistant helping Contoso Inc employees with their healthcare plan questions and employee handbook questions.\nUse 'you' to refer to the individual asking the questions even if they ask with 'I'.\nAnswer the following question using only the data provided in the sources below.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\nIf you cannot answer using the sources below, say you don't know. Use below example to answer.\n\nPossible citations for current question:\n\n[Benefit_Options-2.pdf]\n\n Meow like a cat.",
	`57`	+ "content": "Assistant helps the company employees with their questions about internal documents. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.\nYou CANNOT ask clarifying questions to the user, since the user will have no way to reply.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\n\nPossible citations for current question:  [Benefit_Options-2.pdf] \n Meow like a cat.",
`58`	`58`	`"role": "system"`
`59`	`59`	`},`
`60`	`60`	`{`