Skip to content

Commit d593dc2

Browse files
committed
Add evaluation workflow
1 parent a7dfc64 commit d593dc2

File tree

8 files changed

+405
-4
lines changed

8 files changed

+405
-4
lines changed

.github/workflows/evaluate.yaml

Lines changed: 227 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,227 @@
1+
# On-demand evaluation of the RAG answer flow for a pull request.
name: Evaluate RAG answer flow

# Triggered by every newly created issue or pull request comment; the job-level
# `if` decides whether a given comment actually starts an evaluation.
on:
  issue_comment:
    types:
      - created

# Set up permissions for deploying with secretless Azure federated credentials
# https://learn.microsoft.com/azure/developer/github/connect-from-azure?tabs=azure-portal%2Clinux#set-up-azure-login-with-openid-connect-authentication
permissions:
  id-token: write
  contents: read
  issues: write
  pull-requests: write
14+
15+
jobs:
  # Runs the evaluation suite against the head of a pull request and posts the
  # results back to that pull request as a comment.
  evaluate:
    # This job checks out and executes pull request code while logged in to
    # Azure, so it only runs when all of the following hold:
    #   * the commenter has a trusted association with the repository,
    #   * the comment was made on a pull request (not a plain issue),
    #   * the comment body is exactly `/evaluate`.
    # fromJSON turns the list into a real array so contains() performs exact
    # membership matching instead of substring matching on a string.
    if: |
      contains(fromJSON('["OWNER", "CONTRIBUTOR", "COLLABORATOR", "MEMBER"]'), github.event.comment.author_association) &&
      github.event.issue.pull_request &&
      github.event.comment.body == '/evaluate'
    runs-on: ubuntu-latest
    env:
      # azd required
      AZURE_CLIENT_ID: ${{ vars.AZURE_CLIENT_ID }}
      AZURE_TENANT_ID: ${{ vars.AZURE_TENANT_ID }}
      AZURE_SUBSCRIPTION_ID: ${{ vars.AZURE_SUBSCRIPTION_ID }}
      AZURE_ENV_NAME: ${{ vars.AZURE_ENV_NAME }}
      AZURE_LOCATION: ${{ vars.AZURE_LOCATION }}
      # project specific
      AZURE_OPENAI_SERVICE: ${{ vars.AZURE_OPENAI_SERVICE }}
      AZURE_OPENAI_LOCATION: ${{ vars.AZURE_OPENAI_LOCATION }}
      AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
      AZURE_OPENAI_RESOURCE_GROUP: ${{ vars.AZURE_OPENAI_RESOURCE_GROUP }}
      AZURE_DOCUMENTINTELLIGENCE_SERVICE: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SERVICE }}
      AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP }}
      AZURE_DOCUMENTINTELLIGENCE_SKU: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_SKU }}
      AZURE_DOCUMENTINTELLIGENCE_LOCATION: ${{ vars.AZURE_DOCUMENTINTELLIGENCE_LOCATION }}
      AZURE_COMPUTER_VISION_SERVICE: ${{ vars.AZURE_COMPUTER_VISION_SERVICE }}
      AZURE_COMPUTER_VISION_RESOURCE_GROUP: ${{ vars.AZURE_COMPUTER_VISION_RESOURCE_GROUP }}
      AZURE_COMPUTER_VISION_LOCATION: ${{ vars.AZURE_COMPUTER_VISION_LOCATION }}
      AZURE_COMPUTER_VISION_SKU: ${{ vars.AZURE_COMPUTER_VISION_SKU }}
      AZURE_SEARCH_INDEX: ${{ vars.AZURE_SEARCH_INDEX }}
      AZURE_SEARCH_SERVICE: ${{ vars.AZURE_SEARCH_SERVICE }}
      AZURE_SEARCH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SEARCH_SERVICE_RESOURCE_GROUP }}
      AZURE_SEARCH_SERVICE_LOCATION: ${{ vars.AZURE_SEARCH_SERVICE_LOCATION }}
      AZURE_SEARCH_SERVICE_SKU: ${{ vars.AZURE_SEARCH_SERVICE_SKU }}
      AZURE_SEARCH_QUERY_LANGUAGE: ${{ vars.AZURE_SEARCH_QUERY_LANGUAGE }}
      AZURE_SEARCH_QUERY_SPELLER: ${{ vars.AZURE_SEARCH_QUERY_SPELLER }}
      AZURE_SEARCH_SEMANTIC_RANKER: ${{ vars.AZURE_SEARCH_SEMANTIC_RANKER }}
      AZURE_STORAGE_ACCOUNT: ${{ vars.AZURE_STORAGE_ACCOUNT }}
      AZURE_STORAGE_RESOURCE_GROUP: ${{ vars.AZURE_STORAGE_RESOURCE_GROUP }}
      AZURE_STORAGE_SKU: ${{ vars.AZURE_STORAGE_SKU }}
      AZURE_APP_SERVICE_PLAN: ${{ vars.AZURE_APP_SERVICE_PLAN }}
      AZURE_APP_SERVICE_SKU: ${{ vars.AZURE_APP_SERVICE_SKU }}
      AZURE_APP_SERVICE: ${{ vars.AZURE_APP_SERVICE }}
      AZURE_OPENAI_CHATGPT_MODEL: ${{ vars.AZURE_OPENAI_CHATGPT_MODEL }}
      AZURE_OPENAI_CHATGPT_DEPLOYMENT: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT }}
      AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_CAPACITY }}
      AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_CHATGPT_DEPLOYMENT_VERSION }}
      AZURE_OPENAI_EMB_MODEL_NAME: ${{ vars.AZURE_OPENAI_EMB_MODEL_NAME }}
      AZURE_OPENAI_EMB_DEPLOYMENT: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT }}
      AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_CAPACITY }}
      AZURE_OPENAI_EMB_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_EMB_DEPLOYMENT_VERSION }}
      AZURE_OPENAI_EMB_DIMENSIONS: ${{ vars.AZURE_OPENAI_EMB_DIMENSIONS }}
      AZURE_OPENAI_GPT4V_MODEL: ${{ vars.AZURE_OPENAI_GPT4V_MODEL }}
      AZURE_OPENAI_GPT4V_DEPLOYMENT: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT }}
      AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_CAPACITY }}
      AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_VERSION }}
      AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU: ${{ vars.AZURE_OPENAI_GPT4V_DEPLOYMENT_SKU }}
      AZURE_OPENAI_DISABLE_KEYS: ${{ vars.AZURE_OPENAI_DISABLE_KEYS }}
      OPENAI_HOST: ${{ vars.OPENAI_HOST }}
      # NOTE(review): an API key is read from `vars`, which GitHub does not mask
      # in logs; confirm whether this should come from `secrets` instead.
      OPENAI_API_KEY: ${{ vars.OPENAI_API_KEY }}
      OPENAI_ORGANIZATION: ${{ vars.OPENAI_ORGANIZATION }}
      AZURE_USE_APPLICATION_INSIGHTS: ${{ vars.AZURE_USE_APPLICATION_INSIGHTS }}
      AZURE_APPLICATION_INSIGHTS: ${{ vars.AZURE_APPLICATION_INSIGHTS }}
      AZURE_APPLICATION_INSIGHTS_DASHBOARD: ${{ vars.AZURE_APPLICATION_INSIGHTS_DASHBOARD }}
      AZURE_LOG_ANALYTICS: ${{ vars.AZURE_LOG_ANALYTICS }}
      USE_VECTORS: ${{ vars.USE_VECTORS }}
      USE_GPT4V: ${{ vars.USE_GPT4V }}
      AZURE_VISION_ENDPOINT: ${{ vars.AZURE_VISION_ENDPOINT }}
      VISION_SECRET_NAME: ${{ vars.VISION_SECRET_NAME }}
      ENABLE_LANGUAGE_PICKER: ${{ vars.ENABLE_LANGUAGE_PICKER }}
      USE_SPEECH_INPUT_BROWSER: ${{ vars.USE_SPEECH_INPUT_BROWSER }}
      USE_SPEECH_OUTPUT_BROWSER: ${{ vars.USE_SPEECH_OUTPUT_BROWSER }}
      USE_SPEECH_OUTPUT_AZURE: ${{ vars.USE_SPEECH_OUTPUT_AZURE }}
      AZURE_SPEECH_SERVICE: ${{ vars.AZURE_SPEECH_SERVICE }}
      # NOTE(review): the variable name lacks the `_SERVICE` infix used by the
      # neighbouring speech settings; confirm the repository variable really is
      # AZURE_SPEECH_RESOURCE_GROUP.
      AZURE_SPEECH_SERVICE_RESOURCE_GROUP: ${{ vars.AZURE_SPEECH_RESOURCE_GROUP }}
      AZURE_SPEECH_SERVICE_LOCATION: ${{ vars.AZURE_SPEECH_SERVICE_LOCATION }}
      AZURE_SPEECH_SERVICE_SKU: ${{ vars.AZURE_SPEECH_SERVICE_SKU }}
      AZURE_SPEECH_SERVICE_VOICE: ${{ vars.AZURE_SPEECH_SERVICE_VOICE }}
      AZURE_KEY_VAULT_NAME: ${{ vars.AZURE_KEY_VAULT_NAME }}
      AZURE_USE_AUTHENTICATION: ${{ vars.AZURE_USE_AUTHENTICATION }}
      AZURE_ENFORCE_ACCESS_CONTROL: ${{ vars.AZURE_ENFORCE_ACCESS_CONTROL }}
      AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS: ${{ vars.AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS }}
      AZURE_ENABLE_UNAUTHENTICATED_ACCESS: ${{ vars.AZURE_ENABLE_UNAUTHENTICATED_ACCESS }}
      AZURE_AUTH_TENANT_ID: ${{ vars.AZURE_AUTH_TENANT_ID }}
      AZURE_SERVER_APP_ID: ${{ vars.AZURE_SERVER_APP_ID }}
      AZURE_CLIENT_APP_ID: ${{ vars.AZURE_CLIENT_APP_ID }}
      ALLOWED_ORIGIN: ${{ vars.ALLOWED_ORIGIN }}
      AZURE_ADLS_GEN2_STORAGE_ACCOUNT: ${{ vars.AZURE_ADLS_GEN2_STORAGE_ACCOUNT }}
      AZURE_ADLS_GEN2_FILESYSTEM_PATH: ${{ vars.AZURE_ADLS_GEN2_FILESYSTEM_PATH }}
      AZURE_ADLS_GEN2_FILESYSTEM: ${{ vars.AZURE_ADLS_GEN2_FILESYSTEM }}
      DEPLOYMENT_TARGET: ${{ vars.DEPLOYMENT_TARGET }}
      AZURE_CONTAINER_APPS_WORKLOAD_PROFILE: ${{ vars.AZURE_CONTAINER_APPS_WORKLOAD_PROFILE }}
      USE_CHAT_HISTORY_BROWSER: ${{ vars.USE_CHAT_HISTORY_BROWSER }}
      USE_MEDIA_DESCRIBER_AZURE_CU: ${{ vars.USE_MEDIA_DESCRIBER_AZURE_CU }}
    steps:

      # Acknowledge the request right away so the commenter knows it was picked up.
      - name: Comment on pull request
        uses: actions/github-script@v7
        with:
          script: |
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: "Starting evaluation! Check the Actions tab for progress, or wait for a comment with the results."
            })

      # issue_comment events do not check out the PR by default, so the pull
      # request head is requested explicitly by ref.
      - name: Checkout pull request
        uses: actions/checkout@v4
        with:
          ref: refs/pull/${{ github.event.issue.number }}/head

      - name: Install uv
        uses: astral-sh/setup-uv@v5
        with:
          enable-cache: true
          version: "0.4.20"
          cache-dependency-glob: "requirements**.txt"

      - name: Setup node
        uses: actions/setup-node@v4
        with:
          node-version: 18

      # NOTE(review): the version tag of this action was garbled in the reviewed
      # source (e-mail obfuscation); `v2` is assumed - TODO confirm the intended tag.
      - name: Install azd
        uses: Azure/setup-azd@v2

      - name: Login to Azure with az CLI
        uses: azure/login@v2
        with:
          client-id: ${{ env.AZURE_CLIENT_ID }}
          tenant-id: ${{ env.AZURE_TENANT_ID }}
          subscription-id: ${{ env.AZURE_SUBSCRIPTION_ID }}

      - name: Set az account
        uses: azure/CLI@v2
        with:
          inlineScript: |
            az account set --subscription ${{env.AZURE_SUBSCRIPTION_ID}}

      # Only attempted when a client id is configured for federated credentials.
      - name: Login to Azure with azd (Federated Credentials)
        if: ${{ env.AZURE_CLIENT_ID != '' }}
        run: |
          azd auth login `
            --client-id "$Env:AZURE_CLIENT_ID" `
            --federated-credential-provider "github" `
            --tenant-id "$Env:AZURE_TENANT_ID"
        shell: pwsh

      - name: Build frontend
        run: |
          cd ./app/frontend
          npm install
          npm run build

      # NOTE(review): `uv pip install` targets a virtual environment by default
      # and no environment setup is visible in this job; confirm one is active
      # or that UV_SYSTEM_PYTHON / --system is intended.
      - name: Install dependencies
        run: |
          uv pip install -r requirements-dev.txt

      # The server is started detached with stdout/stderr redirected to files
      # that are uploaded as artifacts further down.
      # RUNNER_TRACKING_ID is cleared, presumably so the runner does not reap
      # the background process when this step ends - TODO confirm.
      - name: Run local server in background
        run: |
          RUNNER_TRACKING_ID="" && (nohup python3 -m uvicorn fastapi_app:create_app --factory > serverlogs.out 2> serverlogs.err &)

      - name: Install evaluate dependencies
        run: |
          uv pip install -r evals/requirements.txt

      # Results are written to a per-PR directory so they can be diffed against
      # evals/results/baseline in the summary step.
      - name: Evaluate local RAG flow
        run: |
          python evals/evaluate.py --targeturl=http://127.0.0.1:8000/chat --resultsdir=evals/results/pr${{ github.event.issue.number }}

      # Server logs matter most when an earlier step failed, so these two
      # uploads run unless the workflow was cancelled.
      - name: Upload server logs as build artifact
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: server_logs
          path: ./serverlogs.out

      - name: Upload server error logs as build artifact
        if: ${{ !cancelled() }}
        uses: actions/upload-artifact@v4
        with:
          name: server_error_logs
          path: ./serverlogs.err

      - name: Upload eval results as build artifact
        uses: actions/upload-artifact@v4
        with:
          name: eval_result
          path: ./evals/results/pr${{ github.event.issue.number }}

      # Builds two markdown files: a summary over all result directories and a
      # diff of this PR's run against the baseline. Both go to the step summary;
      # only eval-summary.md is posted to the PR by the next step.
      - name: Summarize results
        if: ${{ success() }}
        run: |
          echo "## Evaluation results" >> eval-summary.md
          python -m evaltools summary evals/results --output=markdown >> eval-summary.md
          echo "## Answer differences across runs" >> run-diff.md
          python -m evaltools diff evals/results/baseline evals/results/pr${{ github.event.issue.number }} --output=markdown >> run-diff.md
          cat eval-summary.md >> $GITHUB_STEP_SUMMARY
          cat run-diff.md >> $GITHUB_STEP_SUMMARY

      # Posts the summary plus a link to this workflow run back to the PR.
      - name: Comment on pull request
        uses: actions/github-script@v7
        with:
          script: |
            const fs = require('fs');
            const summaryPath = "eval-summary.md";
            const summary = fs.readFileSync(summaryPath, 'utf8');
            const runId = process.env.GITHUB_RUN_ID;
            const repo = process.env.GITHUB_REPOSITORY;
            const actionsUrl = `https://github.com/${repo}/actions/runs/${runId}`;
            github.rest.issues.createComment({
              issue_number: context.issue.number,
              owner: context.repo.owner,
              repo: context.repo.repo,
              body: `${summary}\n\n[Check the workflow run for more details](${actionsUrl}).`
            })

docs/evaluation.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,13 @@ Run the evaluation script by running the following command:
8181
python evals/evaluate.py
8282
```
8383
84-
🕰️ This may take a long time, possibly several hours, depending on the number of ground truth questions. You can specify `--numquestions` argument for a test run on a subset of the questions.
84+
The options are:
85+
86+
* `numquestions`: The number of questions to evaluate. By default, this is all questions in the ground truth data.
87+
* `resultsdir`: The directory to write the evaluation results to. By default, this is a timestamped folder in `evals/results`. This option can also be specified in `evaluate_config.json`.
88+
* `targeturl`: The URL of the running application's chat endpoint to evaluate. By default, this is `http://localhost:50505/chat`. This option can also be specified in `evaluate_config.json`.
89+
90+
🕰️ This may take a long time, possibly several hours, depending on the number of ground truth questions.
8591
8692
## Review the evaluation results
8793
@@ -93,12 +99,18 @@ You can see a summary of results across all evaluation runs by running the follo
9399
python -m evaltools summary evals/results
94100
```
95101
96-
Compare answers across runs by running the following command:
102+
Compare answers to the ground truth by running the following command:
97103
98104
```bash
99105
python -m evaltools diff evals/results/baseline/
100106
```
101107
108+
Compare answers across two runs by running the following command:
109+
110+
```bash
111+
python -m evaltools diff evals/results/baseline/ evals/results/SECONDRUNHERE
112+
```
113+
102114
## Run bulk evaluation on a PR
103115
104116
To run the evaluation on the changes in a PR, you can add a `/evaluate` comment to the PR. This will trigger the evaluation workflow to run the evaluation on the PR changes and will post the results to the PR.

evals/evaluate.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,28 @@
1414
logger = logging.getLogger("ragapp")
1515

1616

17+
class AnyCitationMetric(BaseMetric):
    """Custom metric that records whether a response contains at least one citation."""

    METRIC_NAME = "any_citation"

    @classmethod
    def evaluator_fn(cls, **kwargs):
        """Build the per-response evaluator for this metric.

        Returns:
            A keyword-only callable taking ``response`` and returning a
            one-entry dict keyed by ``METRIC_NAME``: ``True``/``False`` for
            whether a citation was found, or ``-1`` when ``response`` is None.
        """

        def any_citation(*, response, **kwargs):
            if response is not None:
                # Matches a bracketed file reference such as "[name.pdf]" or
                # "[name.pdf#page=2]": a 3-4 character extension, optionally
                # followed by "#page=<digits>".
                citation = re.search(r"\[([^\]]+)\.\w{3,4}(#page=\d+)*\]", response)
                return {cls.METRIC_NAME: citation is not None}

            # -1 is the "could not compute" sentinel; get_aggregate_stats drops it.
            logger.warning("Received response of None, can't compute any_citation metric. Setting to -1.")
            return {cls.METRIC_NAME: -1}

        return any_citation

    @classmethod
    def get_aggregate_stats(cls, df):
        """Summarize the metric column of ``df``, ignoring rows marked ``-1``.

        Returns:
            A dict with ``total`` (count of responses with a citation) and
            ``rate`` (their share of the computable rows, rounded to 2 places).
        """
        computable = df[df[cls.METRIC_NAME] != -1]
        scores = computable[cls.METRIC_NAME]
        total = int(scores.sum())
        rate = round(scores.mean(), 2)
        return {"total": total, "rate": rate}
37+
38+
1739
class CitationsMatchedMetric(BaseMetric):
1840
METRIC_NAME = "citations_matched"
1941

@@ -80,6 +102,8 @@ def get_azure_credential():
80102
openai_config = get_openai_config()
81103

82104
register_metric(CitationsMatchedMetric)
105+
register_metric(AnyCitationMetric)
106+
83107
run_evaluate_from_config(
84108
working_dir=Path(__file__).parent,
85109
config_path="evaluate_config.json",

evals/evaluate_config.json

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
{
22
"testdata_path": "ground_truth.jsonl",
3-
"results_dir": "results/experiment<TIMESTAMP>",
4-
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched"],
3+
"results_dir": "results/gpt-4o-mini",
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
55
"target_url": "http://localhost:50505/chat",
66
"target_parameters": {
77
"overrides": {
Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
{
2+
"testdata_path": "ground_truth.jsonl",
3+
"results_dir": "results/gpt-4o-mini",
4+
"requested_metrics": ["gpt_groundedness", "gpt_relevance", "answer_length", "latency", "citations_matched", "any_citation"],
5+
"target_url": "http://localhost:50505/chat",
6+
"target_parameters": {
7+
"overrides": {
8+
"top": 3,
9+
"temperature": 0.3,
10+
"minimum_reranker_score": 0,
11+
"minimum_search_score": 0,
12+
"retrieval_mode": "hybrid",
13+
"semantic_ranker": true,
14+
"semantic_captions": false,
15+
"suggest_followup_questions": false,
16+
"use_oid_security_filter": false,
17+
"use_groups_security_filter": false,
18+
"vector_fields": [
19+
"embedding"
20+
],
21+
"use_gpt4v": false,
22+
"gpt4v_input": "textAndImages",
23+
"seed": 1
24+
}
25+
},
26+
"target_response_answer_jmespath": "message.content",
27+
"target_response_context_jmespath": "context.data_points.text"
28+
}

0 commit comments

Comments
 (0)