Skip to content

Commit 49b66e5

Browse files
authored
Merge pull request #2 from pamelafox/evalsci
New branch for eval
2 parents bfb1b9d + 0cac252 commit 49b66e5

File tree

6 files changed

+70
-145
lines changed

6 files changed

+70
-145
lines changed

.github/workflows/evaluate.yaml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -114,7 +114,7 @@ jobs:
114114
issue_number: context.issue.number,
115115
owner: context.repo.owner,
116116
repo: context.repo.repo,
117-
body: "Starting evaluation! Check the Actions tab for progress, or wait for a comment with the results."
117+
body: "Starting evaluation. Check the Actions tab for progress, or wait for a comment with the results."
118118
})
119119
120120
- name: Checkout pull request
@@ -128,6 +128,7 @@ jobs:
128128
enable-cache: true
129129
version: "0.4.20"
130130
cache-dependency-glob: "requirements**.txt"
131+
python-version: "3.12"
131132

132133
- name: Setup node
133134
uses: actions/setup-node@v4

.github/workflows/python-test.yaml

Lines changed: 0 additions & 80 deletions
This file was deleted.

evals/results/baseline/config.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,7 +1,7 @@
11
{
22
"testdata_path": "ground_truth.jsonl",
3-
"results_dir": "results/experiment<TIMESTAMP>",
4-
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched"],
3+
"results_dir": "results/gpt-4o-mini",
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
55
"target_url": "http://localhost:50505/chat",
66
"target_parameters": {
77
"overrides": {

evals/results/baseline/eval_results.jsonl

Lines changed: 50 additions & 50 deletions
Large diffs are not rendered by default.

evals/results/baseline/evaluate_parameters.json

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,8 @@
11
{
22
"evaluation_gpt_model": "gpt-4",
3-
"evaluation_timestamp": 1738976093,
3+
"evaluation_timestamp": 1739212680,
44
"testdata_path": "/Users/pamelafox/azure-search-openai-demo/evals/ground_truth.jsonl",
5-
"target_url": "https://capps-backend-v3v4cax5h4fjk.mangomushroom-6a80d999.westus.azurecontainerapps.io/chat",
5+
"target_url": "http://localhost:50505/chat",
66
"target_parameters": {
77
"overrides": {
88
"top": 3,

evals/results/baseline/summary.json

Lines changed: 14 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -2,26 +2,30 @@
22
"gpt_groundedness": {
33
"pass_count": 49,
44
"pass_rate": 0.98,
5-
"mean_rating": 4.92
5+
"mean_rating": 4.94
66
},
77
"gpt_relevance": {
88
"pass_count": 49,
99
"pass_rate": 0.98,
10-
"mean_rating": 4.34
10+
"mean_rating": 4.42
1111
},
1212
"answer_length": {
13-
"mean": 634.56,
14-
"max": 1329,
15-
"min": 194
13+
"mean": 667.7,
14+
"max": 1607,
15+
"min": 160
1616
},
1717
"latency": {
18-
"mean": 2.7,
19-
"max": 3.682694,
20-
"min": 2.095263
18+
"mean": 2.96,
19+
"max": 4.377288,
20+
"min": 1.639517
2121
},
2222
"citations_matched": {
23-
"total": 21,
24-
"rate": 0.42
23+
"total": 22,
24+
"rate": 0.45
25+
},
26+
"any_citation": {
27+
"total": 50,
28+
"rate": 1.0
2529
},
2630
"num_questions": {
2731
"total": 50

0 commit comments

Comments
 (0)