Commit 314e2f7

Add evaluate flow as well
1 parent 8849656 commit 314e2f7

13 files changed (+333 −45 lines)

docs/evaluation.md

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
# Evaluating the RAG answer quality

Follow these steps to evaluate the quality of the answers generated by the RAG flow.

* [Deploy a GPT-4 model](#deploy-a-gpt-4-model)
* [Set up the evaluation environment](#set-up-the-evaluation-environment)
* [Generate ground truth data](#generate-ground-truth-data)
* [Run bulk evaluation](#run-bulk-evaluation)
* [Review the evaluation results](#review-the-evaluation-results)
* [Run bulk evaluation on a PR](#run-bulk-evaluation-on-a-pr)

## Deploy a GPT-4 model

1. Run this command to tell `azd` to deploy a GPT-4 level model for evaluation:

    ```shell
    azd env set USE_EVAL true
    ```

2. Set the capacity to the highest value available to you, so that the evaluation runs quickly:

    ```shell
    azd env set AZURE_OPENAI_EVAL_DEPLOYMENT_CAPACITY 100
    ```

    By default, that will provision a `gpt-4o` model, version `2024-08-06`. To change those settings, set the azd environment variables `AZURE_OPENAI_EVAL_MODEL` and `AZURE_OPENAI_EVAL_MODEL_VERSION` to the desired values.

3. Then, run the following command to provision the model:

    ```shell
    azd provision
    ```

## Set up the evaluation environment

Install all the dependencies for the evaluation script by running the following command:

```bash
pip install -r evals/requirements.txt
```

## Generate ground truth data

Modify the search terms and tasks in `evals/generate_config.json` to match your domain.

Generate ground truth data by running the following command:

```bash
python evals/generate_ground_truth.py
```

After running that script, review the generated data in `evals/ground_truth.jsonl`, removing any question/answer pairs that don't look like realistic user input.

## Run bulk evaluation

Review the configuration in `evals/evaluate_config.json` to ensure that everything is correctly set up. You may want to adjust the metrics used. See [the ai-rag-chat-evaluator README](https://github.com/Azure-Samples/ai-rag-chat-evaluator) for more information on the available metrics.

By default, the evaluation script will evaluate every question in the ground truth data.
Run the evaluation script with the following command:

```bash
python evals/evaluate.py
```

## Review the evaluation results

The evaluation script writes a summary of the evaluation results to the `evals/results` directory.

You can see a summary of results across all evaluation runs by running the following command:

```bash
python -m evaltools summary evals/results
```

Compare answers across runs by running the following command:

```bash
python -m evaltools diff evals/results/baseline/
```

## Run bulk evaluation on a PR

To run the evaluation on the changes in a PR, add a `/evaluate` comment to the PR. This triggers the evaluation workflow, which runs the evaluation on the PR changes and posts the results to the PR.
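The review step in "Generate ground truth data" above can also be partially automated. Below is a minimal sketch, assuming the `{"question": ..., "truth": ...}` JSONL shape that `evals/generate_ground_truth.py` writes; the sample data and the crude word-count filter are hypothetical stand-ins for a human review pass, not part of the repo.

```python
import json
from pathlib import Path

# Illustrative sample in the ground_truth.jsonl format produced by
# evals/generate_ground_truth.py (one {"question", "truth"} object per line).
sample = [
    {"question": "How do I enroll in benefits?", "truth": "Visit the benefits portal ..."},
    {"question": "asdf asdf", "truth": "..."},  # not realistic user input; should be dropped
]
path = Path("ground_truth_sample.jsonl")
with path.open("w") as f:
    for pair in sample:
        f.write(json.dumps(pair) + "\n")

# Keep only pairs whose question looks like a real user query. A length
# check is a blunt heuristic; a manual review pass is still recommended.
kept = []
with path.open() as f:
    for line in f:
        pair = json.loads(line)
        if len(pair["question"].split()) >= 4:
            kept.append(pair)

print(len(kept))
```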

evals/evaluate.py

Lines changed: 58 additions & 0 deletions
@@ -0,0 +1,58 @@
```python
import argparse
import logging
import os
from pathlib import Path

from azure.identity import AzureDeveloperCliCredential
from dotenv_azd import load_azd_env
from evaltools.eval.evaluate import run_evaluate_from_config
from rich.logging import RichHandler

logger = logging.getLogger("ragapp")


def get_openai_config():
    azure_endpoint = f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com"
    azure_deployment = os.environ["AZURE_OPENAI_EVAL_DEPLOYMENT"]
    openai_config = {"azure_endpoint": azure_endpoint, "azure_deployment": azure_deployment}
    # azure-ai-evaluation will call DefaultAzureCredential behind the scenes,
    # so we must be logged in to Azure CLI with the correct tenant
    return openai_config


def get_azure_credential():
    AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
    if AZURE_TENANT_ID:
        logger.info("Setting up Azure credential using AzureDeveloperCliCredential with tenant_id %s", AZURE_TENANT_ID)
        azure_credential = AzureDeveloperCliCredential(tenant_id=AZURE_TENANT_ID, process_timeout=60)
    else:
        logger.info("Setting up Azure credential using AzureDeveloperCliCredential for home tenant")
        azure_credential = AzureDeveloperCliCredential(process_timeout=60)
    return azure_credential


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO, format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]
    )
    load_azd_env()

    parser = argparse.ArgumentParser(description="Run evaluation with OpenAI configuration.")
    parser.add_argument("--targeturl", type=str, help="Specify the target URL.")
    parser.add_argument("--resultsdir", type=Path, help="Specify the results directory.")
    parser.add_argument("--numquestions", type=int, help="Specify the number of questions.")

    args = parser.parse_args()

    openai_config = get_openai_config()

    run_evaluate_from_config(
        working_dir=Path(__file__).parent,
        config_path="evaluate_config.json",
        num_questions=args.numquestions,
        target_url=args.targeturl,
        results_dir=args.resultsdir,
        openai_config=openai_config,
        model=os.environ["AZURE_OPENAI_EVAL_MODEL"],
        azure_credential=get_azure_credential(),
    )
```
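The tenant-selection branch in `get_azure_credential` can be modeled as a pure function over the environment, which makes its behavior easy to verify without touching Azure. This is an illustrative sketch only; the function name and the kwargs-dict shape are not part of the script, which constructs `AzureDeveloperCliCredential` directly.

```python
# Sketch of the logic: when AZURE_TENANT_ID is set, the credential is pinned
# to that tenant; otherwise the home tenant of the azd login is used.
def credential_kwargs(env: dict) -> dict:
    kwargs = {"process_timeout": 60}
    tenant_id = env.get("AZURE_TENANT_ID")
    if tenant_id:
        kwargs["tenant_id"] = tenant_id
    return kwargs

print(credential_kwargs({}))
print(credential_kwargs({"AZURE_TENANT_ID": "00000000-0000-0000-0000-000000000000"}))
```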

evals/evaluate_config.json

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
```json
{
    "testdata_path": "ground_truth.jsonl",
    "results_dir": "results/experiment<TIMESTAMP>",
    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"],
    "target_url": "http://localhost:50505/chat",
    "target_parameters": {
        "overrides": {
            "top": 3,
            "temperature": 0.3,
            "minimum_reranker_score": 0,
            "minimum_search_score": 0,
            "retrieval_mode": "hybrid",
            "semantic_ranker": true,
            "semantic_captions": false,
            "suggest_followup_questions": false,
            "use_oid_security_filter": false,
            "use_groups_security_filter": false,
            "vector_fields": ["embedding"],
            "use_gpt4v": false,
            "gpt4v_input": "textAndImages",
            "seed": 1
        }
    },
    "target_response_answer_jmespath": "message.content",
    "target_response_context_jmespath": "context.data_points.text"
}
```
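The two `*_jmespath` settings tell the evaluator where to find the answer text and the retrieval context in each `/chat` response. For dotted paths like these, a JMESPath lookup behaves like a chain of dictionary accesses; the sketch below mimics that with a plain dot-path helper. The response shape is an assumption inferred from the two expressions, and real JMESPath supports far more than this.

```python
from functools import reduce

# Minimal dot-path lookup that mimics what these simple JMESPath
# expressions do (not a JMESPath implementation).
def extract(path: str, data: dict):
    return reduce(lambda d, key: d[key], path.split("."), data)

# A response shaped like the app's /chat output, as implied by the config
# (field values here are made up for illustration).
response = {
    "message": {"content": "Contoso offers a PPO and an HMO plan."},
    "context": {"data_points": {"text": ["Benefits_Overview.pdf#page=2: ..."]}},
}

answer = extract("message.content", response)
context = extract("context.data_points.text", response)
print(answer)
```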

evals/generate_config.json

Lines changed: 35 additions & 31 deletions
@@ -1,33 +1,37 @@

This hunk is a whitespace-only re-indentation of the JSON; the resulting file content is:

```json
{
    "num_per_task": 2,
    "simulations": [
        {
            "search_term": "benefits",
            "tasks": [
                "I am a new employee and I want to learn about benefits.",
                "I am a new employee and I want to enroll in benefits.",
                "I am a new parent and I want to learn about benefits."
            ]
        },
        {
            "search_term": "policies",
            "tasks": [
                "I am a new employee and I want to learn about policies.",
                "I am a new employee and I want to learn about the dress code.",
                "I am a new employee and I want to learn about the vacation policy."
            ]
        },
        {
            "search_term": "payroll",
            "tasks": [
                "I am a new employee and I want to learn about payroll.",
                "I am a new employee and I want to learn about direct deposit.",
                "I am a new employee and I want to learn about pay stubs."
            ]
        },
        {
            "search_term": "careers",
            "tasks": [
                "I am a new employee and I want to learn about career opportunities.",
                "I am a new employee and I want to learn about the promotion process.",
                "I am a new employee and I want to learn about the training program."
            ]
        }
    ]
}
```
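Assuming `num_per_task` means the number of QA pairs generated per task (an assumption worth confirming against `evals/generate_ground_truth.py`), the expected size of the generated ground truth set follows directly from this config:

```python
# Mirrors the shape of evals/generate_config.json: 4 search terms, each with
# 3 tasks (task text elided here), and num_per_task = 2.
tasks_per_term = {"benefits": 3, "policies": 3, "payroll": 3, "careers": 3}
num_per_task = 2

total_tasks = sum(tasks_per_term.values())
expected_pairs = total_tasks * num_per_task
print(expected_pairs)  # 12 tasks x 2 pairs per task
```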

evals/generate_ground_truth.py

Lines changed: 2 additions & 14 deletions
```diff
@@ -7,13 +7,12 @@

 import requests
 from azure.ai.evaluation.simulator import Simulator
-from azure.identity import AzureDeveloperCliCredential, get_bearer_token_provider
+from azure.identity import AzureDeveloperCliCredential
 from azure.search.documents import SearchClient
 from azure.search.documents.models import (
     QueryType,
 )
 from dotenv_azd import load_azd_env
-from openai import AzureOpenAI

 logger = logging.getLogger("evals")

@@ -72,17 +71,6 @@ def get_simulator() -> Simulator:
     return simulator


-def get_openai_client():
-    azure_credential = get_azure_credential()
-    token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default")
-    openai_client = AzureOpenAI(
-        api_version=os.getenv("AZURE_OPENAI_API_VERSION") or "2024-06-01",
-        azure_endpoint=f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com",
-        azure_ad_token_provider=token_provider,
-    )
-    return openai_client
-
-
 def get_azure_credential():
     AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
     if AZURE_TENANT_ID:
@@ -135,7 +123,7 @@ async def generate_ground_truth(azure_credential, simulations: list[dict], num_p
     qa_pairs = []
     for output in outputs:
         qa_pairs.append({"question": output["messages"][0]["content"], "truth": output["messages"][1]["content"]})
-    with open(CURRENT_DIR / "ground_truth_singleturn.jsonl", "a") as f:
+    with open(CURRENT_DIR / "ground_truth.jsonl", "a") as f:
         for qa_pair in qa_pairs:
             f.write(json.dumps(qa_pair) + "\n")

```
File renamed without changes.

evals/requirements.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -1,3 +1,4 @@
 dotenv-azd==0.2.0
 azure-ai-evaluation==1.0.1
 rich
+git+https://github.com/Azure-Samples/ai-rag-chat-evaluator
```

evals/results/baseline/config.json

Lines changed: 28 additions & 0 deletions
@@ -0,0 +1,28 @@
```json
{
    "testdata_path": "ground_truth.jsonl",
    "results_dir": "results/experiment<TIMESTAMP>",
    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"],
    "target_url": "http://localhost:50505/chat",
    "target_parameters": {
        "overrides": {
            "top": 3,
            "temperature": 0.3,
            "minimum_reranker_score": 0,
            "minimum_search_score": 0,
            "retrieval_mode": "hybrid",
            "semantic_ranker": true,
            "semantic_captions": false,
            "suggest_followup_questions": false,
            "use_oid_security_filter": false,
            "use_groups_security_filter": false,
            "vector_fields": ["embedding"],
            "use_gpt4v": false,
            "gpt4v_input": "textAndImages",
            "seed": 1
        }
    },
    "target_response_answer_jmespath": "message.content",
    "target_response_context_jmespath": "context.data_points.text"
}
```
