Make it easy to run evaluation directly from this repo #2233
Merged
Changes from 9 commits
All 16 commits (by pamelafox):

- 34cc860 Updating docs
- 42cd5d1 Update requirements.txt
- 98bc256 Update diagram
- 897f4bf Add typing extensions explicitly
- e757646 Adding ground truth generation
- 8849656 Merge branch 'main' into evals
- 314e2f7 Add evaluate flow as well
- dd6780e Add RAGAS
- 8a70c5b Add RAGAS
- 9b5ca5c Merge branch 'main' into evals
- 6471747 Remove simulator
- a839b33 Improvements to RAGAS code
- 2305fc1 More logging, save knowledge graph after transforms
- 24425ce Update baseline, add citations matched metric, use separate venv for …
- 5e314ee Update the requirements to latest tag
- 02f4ba8 Logger fixes
# Evaluating the RAG answer quality

Follow these steps to evaluate the quality of the answers generated by the RAG flow.

* [Deploy a GPT-4 model](#deploy-a-gpt-4-model)
* [Setup the evaluation environment](#setup-the-evaluation-environment)
* [Generate ground truth data](#generate-ground-truth-data)
* [Run bulk evaluation](#run-bulk-evaluation)
* [Review the evaluation results](#review-the-evaluation-results)
* [Run bulk evaluation on a PR](#run-bulk-evaluation-on-a-pr)

## Deploy a GPT-4 model

1. Run this command to tell `azd` to deploy a GPT-4 level model for evaluation:

    ```shell
    azd env set USE_EVAL true
    ```

2. Set the capacity to the highest value available to you, so that the evaluation runs quickly:

    ```shell
    azd env set AZURE_OPENAI_EVAL_DEPLOYMENT_CAPACITY 100
    ```

    By default, this provisions a `gpt-4o` model, version `2024-08-06`. To change those settings, set the azd environment variables `AZURE_OPENAI_EVAL_MODEL` and `AZURE_OPENAI_EVAL_MODEL_VERSION` to the desired values.

3. Then run the following command to provision the model:

    ```shell
    azd provision
    ```
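If you do want a different eval model, the same `azd env set` pattern applies to the model variables mentioned in step 2; for example (the values here are illustrative, not recommendations):

```shell
# Choose a different eval model and version (illustrative values),
# then re-run `azd provision` to apply the change.
azd env set AZURE_OPENAI_EVAL_MODEL gpt-4o
azd env set AZURE_OPENAI_EVAL_MODEL_VERSION 2024-08-06
```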
## Setup the evaluation environment

Install all the dependencies for the evaluation script by running the following command:

```bash
pip install -r evals/requirements.txt
```

## Generate ground truth data

Modify the search terms and tasks in `evals/generate_config.json` to match your domain.

Generate ground truth data by running the following command:

```bash
python evals/generate_ground_truth_data.py
```

After running that script, review the generated data in `evals/ground_truth.jsonl`, removing any question/answer pairs that don't seem like realistic user input.
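Each line of that file is a JSON object with `question` and `truth` keys (the keys come from the generation script; the content below is made up). A minimal sketch of programmatically pruning unrealistic pairs, using an illustrative length heuristic rather than anything from the repo:

```python
import json

# Two example lines in the ground_truth.jsonl format; the content is hypothetical.
lines = [
    '{"question": "What health plans are offered to new employees?", "truth": "Northwind Health Plus and Standard."}',
    '{"question": "benefits?", "truth": "..."}',
]
pairs = [json.loads(line) for line in lines]

# Keep only pairs whose question reads like a realistic user query;
# this word-count check is just an illustrative heuristic.
kept = [p for p in pairs if len(p["question"].split()) >= 3]
print(len(kept))  # 1
```

In practice you would read and rewrite `evals/ground_truth.jsonl` itself, or simply delete lines in an editor.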
## Run bulk evaluation

Review the configuration in `evals/evaluate_config.json` to ensure that everything is correctly set up. You may want to adjust the metrics used. See [the ai-rag-chat-evaluator README](https://github.com/Azure-Samples/ai-rag-chat-evaluator) for more information on the available metrics.

By default, the evaluation script evaluates every question in the ground truth data.
Run the evaluation with the following command:

```bash
python evals/evaluate.py
```

## Review the evaluation results

The evaluation script outputs a summary of the evaluation results inside the `evals/results` directory.

You can see a summary of results across all evaluation runs by running the following command:

```bash
python -m evaltools summary evals/results
```

Compare answers across runs by running the following command:

```bash
python -m evaltools diff evals/results/baseline/
```

## Run bulk evaluation on a PR

To run the evaluation on the changes in a PR, add a `/evaluate` comment to the PR. This triggers the evaluation workflow, which runs the evaluation on the PR changes and posts the results to the PR.
`evals/evaluate.py`:
import argparse
import logging
import os
from pathlib import Path

from azure.identity import AzureDeveloperCliCredential
from dotenv_azd import load_azd_env
from evaltools.eval.evaluate import run_evaluate_from_config
from rich.logging import RichHandler

logger = logging.getLogger("ragapp")


def get_openai_config():
    azure_endpoint = f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com"
    azure_deployment = os.environ["AZURE_OPENAI_EVAL_DEPLOYMENT"]
    openai_config = {"azure_endpoint": azure_endpoint, "azure_deployment": azure_deployment}
    # azure-ai-evaluation calls DefaultAzureCredential behind the scenes,
    # so we must be logged in to the Azure CLI with the correct tenant
    return openai_config


def get_azure_credential():
    AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
    if AZURE_TENANT_ID:
        logger.info("Setting up Azure credential using AzureDeveloperCliCredential with tenant_id %s", AZURE_TENANT_ID)
        azure_credential = AzureDeveloperCliCredential(tenant_id=AZURE_TENANT_ID, process_timeout=60)
    else:
        logger.info("Setting up Azure credential using AzureDeveloperCliCredential for home tenant")
        azure_credential = AzureDeveloperCliCredential(process_timeout=60)
    return azure_credential


if __name__ == "__main__":
    logging.basicConfig(
        level=logging.INFO, format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]
    )
    load_azd_env()

    parser = argparse.ArgumentParser(description="Run evaluation with OpenAI configuration.")
    parser.add_argument("--targeturl", type=str, help="Specify the target URL.")
    parser.add_argument("--resultsdir", type=Path, help="Specify the results directory.")
    parser.add_argument("--numquestions", type=int, help="Specify the number of questions.")

    args = parser.parse_args()

    openai_config = get_openai_config()

    run_evaluate_from_config(
        working_dir=Path(__file__).parent,
        config_path="evaluate_config.json",
        num_questions=args.numquestions,
        target_url=args.targeturl,
        results_dir=args.resultsdir,
        openai_config=openai_config,
        model=os.environ["AZURE_OPENAI_EVAL_MODEL"],
        azure_credential=get_azure_credential(),
    )
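All three command-line flags are optional; any flag you omit parses to `None` and falls back to the config-file defaults. A quick sketch re-creating the parser above to show how a typical invocation parses (the flag values are hypothetical):

```python
import argparse
from pathlib import Path

# Re-create the parser from evals/evaluate.py to show how its flags combine.
parser = argparse.ArgumentParser(description="Run evaluation with OpenAI configuration.")
parser.add_argument("--targeturl", type=str, help="Specify the target URL.")
parser.add_argument("--resultsdir", type=Path, help="Specify the results directory.")
parser.add_argument("--numquestions", type=int, help="Specify the number of questions.")

# e.g. `python evals/evaluate.py --numquestions 2 --resultsdir evals/results/experiment1`
args = parser.parse_args(["--numquestions", "2", "--resultsdir", "evals/results/experiment1"])
print(args.numquestions, args.resultsdir, args.targeturl)  # 2 evals/results/experiment1 None
```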
`evals/evaluate_config.json`:
{
    "testdata_path": "ground_truth.jsonl",
    "results_dir": "results/experiment<TIMESTAMP>",
    "requested_metrics": ["gpt_groundedness", "gpt_relevance", "gpt_coherence", "answer_length", "latency"],
    "target_url": "http://localhost:50505/chat",
    "target_parameters": {
        "overrides": {
            "top": 3,
            "temperature": 0.3,
            "minimum_reranker_score": 0,
            "minimum_search_score": 0,
            "retrieval_mode": "hybrid",
            "semantic_ranker": true,
            "semantic_captions": false,
            "suggest_followup_questions": false,
            "use_oid_security_filter": false,
            "use_groups_security_filter": false,
            "vector_fields": ["embedding"],
            "use_gpt4v": false,
            "gpt4v_input": "textAndImages",
            "seed": 1
        }
    },
    "target_response_answer_jmespath": "message.content",
    "target_response_context_jmespath": "context.data_points.text"
}
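The two `*_jmespath` settings tell the evaluator where to find the answer text and the retrieval context inside each `/chat` response. A plain-Python sketch of what those dotted paths select, against a hypothetical response shape (the content is made up; only the key structure matches the config):

```python
# Hypothetical /chat response; the nesting mirrors what the JMESPath
# expressions in the config above expect.
response = {
    "message": {"role": "assistant", "content": "The deductible is $500."},
    "context": {"data_points": {"text": ["Benefit_Options.pdf: The deductible is $500."]}},
}

# "target_response_answer_jmespath": "message.content"
answer = response["message"]["content"]
# "target_response_context_jmespath": "context.data_points.text"
context_texts = response["context"]["data_points"]["text"]

print(answer)  # The deductible is $500.
```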
`evals/generate_config.json`:
{
    "num_per_task": 2,
    "simulations": [
        {
            "search_term": "benefits",
            "tasks": [
                "I am a new employee and I want to learn about benefits.",
                "I am a new employee and I want to enroll in benefits.",
                "I am a new parent and I want to learn about benefits."
            ]
        },
        {
            "search_term": "policies",
            "tasks": [
                "I am a new employee and I want to learn about policies.",
                "I am a new employee and I want to learn about the dress code.",
                "I am a new employee and I want to learn about the vacation policy."
            ]
        },
        {
            "search_term": "payroll",
            "tasks": [
                "I am a new employee and I want to learn about payroll.",
                "I am a new employee and I want to learn about direct deposit.",
                "I am a new employee and I want to learn about pay stubs."
            ]
        },
        {
            "search_term": "careers",
            "tasks": [
                "I am a new employee and I want to learn about career opportunities.",
                "I am a new employee and I want to learn about the promotion process.",
                "I am a new employee and I want to learn about the training program."
            ]
        }
    ]
}
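Since the generation script requests `len(tasks) * num_per_task` queries per simulation, you can estimate the size of the resulting ground truth set up front. A sketch with an abbreviated two-simulation config in the same shape:

```python
import json

# Same structure as generate_config.json, abbreviated to two simulations
# with placeholder task strings.
config = json.loads("""
{
  "num_per_task": 2,
  "simulations": [
    {"search_term": "benefits", "tasks": ["task a", "task b", "task c"]},
    {"search_term": "policies", "tasks": ["task a", "task b", "task c"]}
  ]
}
""")

num_per_task = config.get("num_per_task", 2)
total_queries = sum(len(s["tasks"]) * num_per_task for s in config["simulations"])
print(total_queries)  # 12
```

For the full four-simulation config above, that works out to 4 × 3 × 2 = 24 queries.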
`evals/generate_ground_truth_data.py`:
import asyncio
import json
import logging
import os
from pathlib import Path
from typing import Any, Dict, Optional

import requests
from azure.ai.evaluation.simulator import Simulator
from azure.identity import AzureDeveloperCliCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import (
    QueryType,
)
from dotenv_azd import load_azd_env

logger = logging.getLogger("evals")

CURRENT_DIR = Path(__file__).parent


async def callback(
    messages: Dict[str, Any],  # the Simulator passes a dict with a "messages" list
    stream: bool = False,
    session_state: Any = None,
    context: Optional[Dict[str, Any]] = None,
):
    messages_list = messages["messages"]
    latest_message = messages_list[-1]
    query = latest_message["content"]
    headers = {"Content-Type": "application/json"}
    body = {
        "messages": [{"content": query, "role": "user"}],
        "stream": stream,
        "context": {
            "overrides": {
                "top": 3,
                "temperature": 0.3,
                "minimum_reranker_score": 0,
                "minimum_search_score": 0,
                "retrieval_mode": "hybrid",
                "semantic_ranker": True,
                "semantic_captions": False,
                "suggest_followup_questions": False,
                "use_oid_security_filter": False,
                "use_groups_security_filter": False,
                "vector_fields": ["embedding"],
                "use_gpt4v": False,
                "gpt4v_input": "textAndImages",
                "seed": 1,
            }
        },
    }
    url = "http://localhost:50505/chat"
    r = requests.post(url, headers=headers, json=body)
    response = r.json()
    response["messages"] = messages_list + [response["message"]]
    return response


def get_simulator() -> Simulator:
    azure_endpoint = f"https://{os.getenv('AZURE_OPENAI_SERVICE')}.openai.azure.com"
    # A model that supports JSON mode is preferred to avoid errors, e.g. gpt-4o-mini, gpt-4o, gpt-4 (1106)
    azure_deployment = os.getenv("AZURE_OPENAI_EVAL_DEPLOYMENT")
    model_config = {
        "azure_endpoint": azure_endpoint,
        "azure_deployment": azure_deployment,
    }
    # Simulator uses DefaultAzureCredential, so make sure the Azure CLI is logged in to the correct tenant
    simulator = Simulator(model_config=model_config)
    return simulator


def get_azure_credential():
    AZURE_TENANT_ID = os.getenv("AZURE_TENANT_ID")
    if AZURE_TENANT_ID:
        logger.info("Setting up Azure credential using AzureDeveloperCliCredential with tenant_id %s", AZURE_TENANT_ID)
        azure_credential = AzureDeveloperCliCredential(tenant_id=AZURE_TENANT_ID, process_timeout=60)
    else:
        logger.info("Setting up Azure credential using AzureDeveloperCliCredential for home tenant")
        azure_credential = AzureDeveloperCliCredential(process_timeout=60)
    return azure_credential


def generate_text_from_index(azure_credential, search_term: str) -> str:
    search_client = SearchClient(
        endpoint=f"https://{os.getenv('AZURE_SEARCH_SERVICE')}.search.windows.net",
        index_name=os.getenv("AZURE_SEARCH_INDEX"),
        credential=azure_credential,
    )
    query_language = os.getenv("AZURE_SEARCH_QUERY_LANGUAGE", "en-us")
    query_speller = os.getenv("AZURE_SEARCH_QUERY_SPELLER", "lexicon")
    search_results = search_client.search(
        search_text=search_term,
        top=10,
        query_type=QueryType.SEMANTIC,
        query_language=query_language,
        query_speller=query_speller,
        semantic_configuration_name="default",
        semantic_query=search_term,
    )
    text = ""
    for result in search_results:
        text += result["content"]
    return text[0:5000]


async def generate_ground_truth(azure_credential, simulations: list[dict], num_per_task: int = 2):
    """
    Generates single-turn ground truth Q/A pairs for given search term/tasks combos.
    """
    simulator = get_simulator()
    for simulation in simulations:
        text = generate_text_from_index(azure_credential, simulation["search_term"])
        tasks = simulation["tasks"]
        outputs = await simulator(
            target=callback,
            text=text,
            max_conversation_turns=1,
            num_queries=len(tasks) * num_per_task,
            tasks=tasks * num_per_task,
        )
        qa_pairs = []
        for output in outputs:
            qa_pairs.append({"question": output["messages"][0]["content"], "truth": output["messages"][1]["content"]})
        with open(CURRENT_DIR / "ground_truth.jsonl", "a") as f:
            for qa_pair in qa_pairs:
                f.write(json.dumps(qa_pair) + "\n")


if __name__ == "__main__":
    logging.basicConfig(level=logging.WARNING)
    logger.setLevel(logging.INFO)
    load_azd_env()

    azure_credential = get_azure_credential()

    with open(CURRENT_DIR / "generate_config.json") as f:
        generate_config = json.load(f)
    simulations = generate_config["simulations"]
    num_per_task = generate_config.get("num_per_task", 2)

    asyncio.run(generate_ground_truth(azure_credential, simulations, num_per_task))