Skip to content

Commit 0ee4f21

Browse files
committed
Add ask evaluation, update snapshots
1 parent ce0f7c1 commit 0ee4f21

File tree

8 files changed

+155
-5
lines changed

8 files changed

+155
-5
lines changed

evals/evaluate_config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
{
22
"testdata_path": "ground_truth.jsonl",
3-
"results_dir": "results/experiment<TIMESTAMP>",
3+
"results_dir": "results/baseline-ask",
44
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
5-
"target_url": "http://localhost:50505/chat",
5+
"target_url": "http://localhost:50505/ask",
66
"target_parameters": {
77
"overrides": {
88
"top": 3,
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"testdata_path": "ground_truth.jsonl",
3+
"results_dir": "results/baseline-ask",
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
5+
"target_url": "http://localhost:50505/ask",
6+
"target_parameters": {
7+
"overrides": {
8+
"top": 3,
9+
"max_subqueries": 10,
10+
"results_merge_strategy": "interleaved",
11+
"temperature": 0.3,
12+
"minimum_reranker_score": 0,
13+
"minimum_search_score": 0,
14+
"retrieval_mode": "hybrid",
15+
"semantic_ranker": true,
16+
"semantic_captions": false,
17+
"query_rewriting": false,
18+
"reasoning_effort": "minimal",
19+
"suggest_followup_questions": false,
20+
"use_oid_security_filter": false,
21+
"use_groups_security_filter": false,
22+
"search_text_embeddings": true,
23+
"search_image_embeddings": true,
24+
"send_text_sources": true,
25+
"send_image_sources": true,
26+
"language": "en",
27+
"use_agentic_retrieval": false,
28+
"seed": 1
29+
}
30+
},
31+
"target_response_answer_jmespath": "message.content",
32+
"target_response_context_jmespath": "context.data_points.text"
33+
}

evals/results/baseline-ask/eval_results.jsonl

Lines changed: 50 additions & 0 deletions
Large diffs are not rendered by default.
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
{
2+
"evaluation_gpt_model": "gpt-4o",
3+
"evaluation_timestamp": 1757011333,
4+
"testdata_path": "/Users/pamelafox/azure-search-openai-demo/evals/ground_truth.jsonl",
5+
"target_url": "http://localhost:50505/ask",
6+
"target_parameters": {
7+
"overrides": {
8+
"top": 3,
9+
"max_subqueries": 10,
10+
"results_merge_strategy": "interleaved",
11+
"temperature": 0.3,
12+
"minimum_reranker_score": 0,
13+
"minimum_search_score": 0,
14+
"retrieval_mode": "hybrid",
15+
"semantic_ranker": true,
16+
"semantic_captions": false,
17+
"query_rewriting": false,
18+
"reasoning_effort": "minimal",
19+
"suggest_followup_questions": false,
20+
"use_oid_security_filter": false,
21+
"use_groups_security_filter": false,
22+
"search_text_embeddings": true,
23+
"search_image_embeddings": true,
24+
"send_text_sources": true,
25+
"send_image_sources": true,
26+
"language": "en",
27+
"use_agentic_retrieval": false,
28+
"seed": 1
29+
}
30+
},
31+
"num_questions": null
32+
}
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"gpt_groundedness": {
3+
"pass_count": 49,
4+
"pass_rate": 0.98,
5+
"mean_rating": 4.88
6+
},
7+
"gpt_relevance": {
8+
"pass_count": 46,
9+
"pass_rate": 0.92,
10+
"mean_rating": 4.32
11+
},
12+
"answer_length": {
13+
"mean": 758.08,
14+
"max": 1403,
15+
"min": 193
16+
},
17+
"latency": {
18+
"mean": 3.62,
19+
"max": 6.500667,
20+
"min": -1.0
21+
},
22+
"citations_matched": {
23+
"total": 22,
24+
"rate": 0.44
25+
},
26+
"any_citation": {
27+
"total": 49,
28+
"rate": 0.98
29+
},
30+
"num_questions": {
31+
"total": 50
32+
}
33+
}

tests/snapshots/test_app/test_ask_prompt_template_concat/client0/result.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
{
5555
"description": [
5656
{
57-
"content": "You are an intelligent assistant helping Contoso Inc employees with their healthcare plan questions and employee handbook questions.\nUse 'you' to refer to the individual asking the questions even if they ask with 'I'.\nAnswer the following question using only the data provided in the sources below.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\nIf you cannot answer using the sources below, say you don't know. Use below example to answer.\n\nPossible citations for current question:\n\n[Benefit_Options-2.pdf]\n\n Meow like a cat.",
57+
"content": "Assistant helps the company employees with their questions about internal documents. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.\nYou CANNOT ask clarifying questions to the user, since the user will have no way to reply.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\n\nPossible citations for current question: [Benefit_Options-2.pdf] \n Meow like a cat.",
5858
"role": "system"
5959
},
6060
{

tests/snapshots/test_app/test_ask_prompt_template_concat/client1/result.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,7 +54,7 @@
5454
{
5555
"description": [
5656
{
57-
"content": "You are an intelligent assistant helping Contoso Inc employees with their healthcare plan questions and employee handbook questions.\nUse 'you' to refer to the individual asking the questions even if they ask with 'I'.\nAnswer the following question using only the data provided in the sources below.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\nIf you cannot answer using the sources below, say you don't know. Use below example to answer.\n\nPossible citations for current question:\n\n[Benefit_Options-2.pdf]\n\n Meow like a cat.",
57+
"content": "Assistant helps the company employees with their questions about internal documents. Be brief in your answers.\nAnswer ONLY with the facts listed in the list of sources below. If there isn't enough information below, say you don't know. Do not generate answers that don't use the sources below.\nYou CANNOT ask clarifying questions to the user, since the user will have no way to reply.\nIf the question is not in English, answer in the language used in the question.\nEach source has a name followed by colon and the actual information, always include the source name for each fact you use in the response. Use square brackets to reference the source, for example [info1.txt]. Don't combine sources, list each source separately, for example [info1.txt][info2.pdf].\n\nPossible citations for current question: [Benefit_Options-2.pdf] \n Meow like a cat.",
5858
"role": "system"
5959
},
6060
{

tests/test_app.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -702,7 +702,9 @@ async def test_ask_prompt_template_concat(client, snapshot):
702702
)
703703
assert response.status_code == 200
704704
result = await response.get_json()
705-
assert result["context"]["thoughts"][2]["description"][0]["content"].startswith("You are an intelligent assistant")
705+
assert result["context"]["thoughts"][2]["description"][0]["content"].startswith(
706+
"Assistant helps the company employees"
707+
)
706708
assert result["context"]["thoughts"][2]["description"][0]["content"].endswith("Meow like a cat.")
707709
snapshot.assert_match(json.dumps(result, indent=4), "result.json")
708710

0 commit comments

Comments
 (0)