Merge pull request #2 from pamelafox/evalsci

pamelafox · web-flow · commit 49b66e51c476 · 2025-02-10T11:14:45.000-08:00
New branch for eval
diff --git a/.github/workflows/evaluate.yaml b/.github/workflows/evaluate.yaml
@@ -114,7 +114,7 @@ jobs:
               issue_number: context.issue.number,
               owner: context.repo.owner,
               repo: context.repo.repo,
-              body: "Starting evaluation! Check the Actions tab for progress, or wait for a comment with the results."
+              body: "Starting evaluation. Check the Actions tab for progress, or wait for a comment with the results."
             })
 
       - name: Checkout pull request
@@ -128,6 +128,7 @@ jobs:
           enable-cache: true
           version: "0.4.20"
           cache-dependency-glob: "requirements**.txt"
+          python-version: "3.12"
 
       - name: Setup node
         uses: actions/setup-node@v4
diff --git a/.github/workflows/python-test.yaml b/.github/workflows/python-test.yaml
diff --git a/evals/results/baseline/config.json b/evals/results/baseline/config.json
@@ -1,7 +1,7 @@
 {
     "testdata_path": "ground_truth.jsonl",
-    "results_dir": "results/experiment<TIMESTAMP>",
-    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched"],
+    "results_dir": "results/gpt-4o-mini",
+    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
     "target_url": "http://localhost:50505/chat",
     "target_parameters": {
         "overrides": {
diff --git a/evals/results/baseline/eval_results.jsonl b/evals/results/baseline/eval_results.jsonl
diff --git a/evals/results/baseline/evaluate_parameters.json b/evals/results/baseline/evaluate_parameters.json
@@ -1,8 +1,8 @@
 {
     "evaluation_gpt_model": "gpt-4",
-    "evaluation_timestamp": 1738976093,
+    "evaluation_timestamp": 1739212680,
     "testdata_path": "/Users/pamelafox/azure-search-openai-demo/evals/ground_truth.jsonl",
-    "target_url": "https://capps-backend-v3v4cax5h4fjk.mangomushroom-6a80d999.westus.azurecontainerapps.io/chat",
+    "target_url": "http://localhost:50505/chat",
     "target_parameters": {
         "overrides": {
             "top": 3,
diff --git a/evals/results/baseline/summary.json b/evals/results/baseline/summary.json
@@ -2,26 +2,30 @@
     "gpt_groundedness": {
         "pass_count": 49,
         "pass_rate": 0.98,
-        "mean_rating": 4.92
+        "mean_rating": 4.94
     },
     "gpt_relevance": {
         "pass_count": 49,
         "pass_rate": 0.98,
-        "mean_rating": 4.34
+        "mean_rating": 4.42
     },
     "answer_length": {
-        "mean": 634.56,
-        "max": 1329,
-        "min": 194
+        "mean": 667.7,
+        "max": 1607,
+        "min": 160
     },
     "latency": {
-        "mean": 2.7,
-        "max": 3.682694,
-        "min": 2.095263
+        "mean": 2.96,
+        "max": 4.377288,
+        "min": 1.639517
     },
     "citations_matched": {
-        "total": 21,
-        "rate": 0.42
+        "total": 22,
+        "rate": 0.45
+    },
+    "any_citation": {
+        "total": 50,
+        "rate": 1.0
     },
     "num_questions": {
         "total": 50