Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion .github/workflows/evaluate.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ jobs:
issue_number: context.issue.number,
owner: context.repo.owner,
repo: context.repo.repo,
body: "Starting evaluation! Check the Actions tab for progress, or wait for a comment with the results."
body: "Starting evaluation. Check the Actions tab for progress, or wait for a comment with the results."
})

- name: Checkout pull request
Expand All @@ -128,6 +128,7 @@ jobs:
enable-cache: true
version: "0.4.20"
cache-dependency-glob: "requirements**.txt"
python-version: "3.12"

- name: Setup node
uses: actions/setup-node@v4
Expand Down
80 changes: 0 additions & 80 deletions .github/workflows/python-test.yaml

This file was deleted.

4 changes: 2 additions & 2 deletions evals/results/baseline/config.json
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"testdata_path": "ground_truth.jsonl",
"results_dir": "results/experiment<TIMESTAMP>",
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched"],
"results_dir": "results/gpt-4o-mini",
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
"target_url": "http://localhost:50505/chat",
"target_parameters": {
"overrides": {
Expand Down
100 changes: 50 additions & 50 deletions evals/results/baseline/eval_results.jsonl

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions evals/results/baseline/evaluate_parameters.json
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
{
"evaluation_gpt_model": "gpt-4",
"evaluation_timestamp": 1738976093,
"evaluation_timestamp": 1739212680,
"testdata_path": "/Users/pamelafox/azure-search-openai-demo/evals/ground_truth.jsonl",
"target_url": "https://capps-backend-v3v4cax5h4fjk.mangomushroom-6a80d999.westus.azurecontainerapps.io/chat",
"target_url": "http://localhost:50505/chat",
"target_parameters": {
"overrides": {
"top": 3,
Expand Down
24 changes: 14 additions & 10 deletions evals/results/baseline/summary.json
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,30 @@
"gpt_groundedness": {
"pass_count": 49,
"pass_rate": 0.98,
"mean_rating": 4.92
"mean_rating": 4.94
},
"gpt_relevance": {
"pass_count": 49,
"pass_rate": 0.98,
"mean_rating": 4.34
"mean_rating": 4.42
},
"answer_length": {
"mean": 634.56,
"max": 1329,
"min": 194
"mean": 667.7,
"max": 1607,
"min": 160
},
"latency": {
"mean": 2.7,
"max": 3.682694,
"min": 2.095263
"mean": 2.96,
"max": 4.377288,
"min": 1.639517
},
"citations_matched": {
"total": 21,
"rate": 0.42
"total": 22,
"rate": 0.45
},
"any_citation": {
"total": 50,
"rate": 1.0
},
"num_questions": {
"total": 50
Expand Down