
Commit f4d7026

Add links and progress tracking
1 parent ddd5dec commit f4d7026

File tree: 2 files changed (9 additions, 3 deletions)

* docs/safety_evaluation.md
* evals/safety_evaluation.py

docs/safety_evaluation.md

Lines changed: 7 additions & 2 deletions
@@ -30,8 +30,6 @@ Run the following command to simulate adversarial queries and evaluate the safety
 python evals/safety_evaluation.py --target_url <TARGET_URL> --max_simulation_results <MAX_RESULTS>
 ```
 
-### Arguments
-
 * `--target_url`: The target URL for the callback. Default is `http://localhost:50505/chat`.
 * `--max_simulation_results`: The maximum number of simulation results. Default is `200`.
 
@@ -70,3 +68,10 @@ Results will look like this:
 ```
 
 The ideal score is `low_rate` of 1.0 and `mean_score` of 0.0. The `low_rate` indicates the fraction of answers that were reported as "Low" or "Very low" by an evaluator. The `mean_score` is the average score of all the answers, where 0 is a very safe answer and 7 is a very unsafe answer.
+
+## Resources
+
+To learn more about the Azure AI services used in this project, look through the script and reference the following documentation:
+
+* [Generate simulated data for evaluation](https://learn.microsoft.com/azure/ai-studio/how-to/develop/simulator-interaction-data)
+* [Evaluate with the Azure AI Evaluation SDK](https://learn.microsoft.com/azure/ai-studio/how-to/develop/evaluate-sdk)
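
As context for the ideal-score note added above, here is a minimal sketch of how `low_rate` and `mean_score` can be derived from per-answer severity results. The `results` list and its field names are illustrative assumptions, not the exact output shape of the Azure AI Evaluation SDK:

```python
# Minimal sketch (assumed result shape, not the SDK's exact output):
# each entry is one answer's score (0-7) and severity label from an evaluator.
results = [
    {"score": 0, "severity": "Very low"},
    {"score": 1, "severity": "Low"},
    {"score": 5, "severity": "High"},
]

# low_rate: fraction of answers rated "Low" or "Very low".
low_rate = sum(r["severity"] in ("Low", "Very low") for r in results) / len(results)

# mean_score: average score, where 0 is very safe and 7 is very unsafe.
mean_score = sum(r["score"] for r in results) / len(results)

print(f"low_rate={low_rate:.2f}, mean_score={mean_score:.2f}")  # low_rate=0.67, mean_score=2.00
```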

evals/safety_evaluation.py

Lines changed: 2 additions & 1 deletion
@@ -15,6 +15,7 @@
 from azure.identity import AzureDeveloperCliCredential
 from dotenv_azd import load_azd_env
 from rich.logging import RichHandler
+from rich.progress import track
 
 logger = logging.getLogger("ragapp")
 
@@ -102,7 +103,7 @@ async def run_simulator(target_url: str, max_simulation_results: int):
     # Do not save the outputs, as they may contain disturbing content
     evaluators = ["hate_unfairness", "sexual", "violence", "self_harm"]
     summary_scores = {evaluator: {"low_count": 0, "score_total": 0} for evaluator in evaluators}
-    for output in outputs:
+    for output in track(outputs, description="Evaluating simulated responses..."):
         query = output["messages"][0]["content"]
         answer = output["messages"][1]["content"]
         safety_eval = ContentSafetyEvaluator(credential=credential, azure_ai_project=azure_ai_project)
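
For reviewers unfamiliar with `rich.progress.track`: it wraps an iterable and renders a progress bar as the loop consumes it, so the change above only adds console feedback and does not alter the evaluation logic. A minimal standalone sketch, where the `range` and `sleep` are placeholders standing in for the simulator outputs and the per-output evaluator call:

```python
import time

from rich.progress import track

# track() yields items from the iterable unchanged while rendering a progress bar.
for output in track(range(10), description="Evaluating simulated responses..."):
    time.sleep(0.1)  # stand-in for the per-output ContentSafetyEvaluator call
```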
