
Commit 49e6683

hands on evaluators with prompt flow sdk
1 parent 02c34a1 commit 49e6683

9 files changed: +275 −6 lines

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion

```diff
@@ -13,4 +13,4 @@ repos:
     rev: 24.2.0
     hooks:
       - id: black
-        exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json'
+        exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json|.prompty'
```

apps/11_promptflow/README.md

Lines changed: 9 additions & 0 deletions

````diff
@@ -273,6 +273,15 @@ $ pf run create \
 $ pf run show-details --name $RUN_NAME
 ```
 
+### evaluators
+
+For guidance on working with evaluators, see [Evaluate with the prompt flow SDK](https://learn.microsoft.com/azure/ai-studio/how-to/develop/flow-evaluate-sdk).
+
+```shell
+# Show help
+python apps/11_promptflow/evaluators/main.py --help
+```
+
 <!-- TODO: rag, deployments -->
 
 ## References
````
Lines changed: 1 addition & 0 deletions

```diff
@@ -0,0 +1 @@
+results.json
```
apps/11_promptflow/evaluators/apology.prompty

Lines changed: 25 additions & 0 deletions

```diff
@@ -0,0 +1,25 @@
+---
+name: Apology Evaluator
+description: Apology Evaluator for QA scenario
+model:
+  api: chat
+  configuration:
+    type: azure_openai
+    connection: open_ai_connection
+    azure_deployment: gpt-4
+  parameters:
+    temperature: 0.2
+    response_format: { "type": "text" }
+inputs:
+  question:
+    type: string
+  answer:
+    type: string
+outputs:
+  apology:
+    type: int
+---
+system:
+You are an AI tool that determines if, in a chat conversation, the assistant apologized, for example by saying sorry.
+Only provide a response of {"apology": 0} or {"apology": 1} so that the output is valid JSON.
+Give an apology score of 1 if the assistant apologized in the chat conversation.
```
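A prompty like this becomes a callable evaluator once loaded with the SDK. A minimal sketch, mirroring the `get_apology_evaluator` helper in main.py below; the endpoint, key, and deployment values are placeholders (main.py reads them from environment variables):

```python
from promptflow.client import load_flow
from promptflow.core import AzureOpenAIModelConfiguration

# Placeholder credentials; substitute your own Azure OpenAI resource values.
model_config = AzureOpenAIModelConfiguration(
    azure_endpoint="https://<your-resource>.openai.azure.com/",
    api_key="<your-api-key>",
    azure_deployment="gpt-4",
)

# Load the prompty as a flow and call it like a function.
apology_eval = load_flow(source="apology.prompty", model={"configuration": model_config})
result = apology_eval(
    question="Where can I get my car fixed?",
    answer="I'm sorry, I don't know that.",
)
print(result)  # expected shape: {"apology": 0} or {"apology": 1}
```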
apps/11_promptflow/evaluators/data.jsonl

Lines changed: 2 additions & 0 deletions

```diff
@@ -0,0 +1,2 @@
+{"answer": "Paris is the capital of France.", "context": "France is in Europe", "ground_truth": "Paris has been the capital of France since the 10th century and is known for its cultural and historical landmarks.", "question": "What is the capital of France?"}
+{"answer": "I'm sorry, I don't know that. Would you like me to look it up for you?", "context": "Fixing car isn't supported.", "ground_truth": "We don't support car fixing service. I'm sorry, I don't know that. Would you like me to look it up for you?", "question": "Where can I get my car fixed?"}
```
apps/11_promptflow/evaluators/main.py

Lines changed: 135 additions & 0 deletions

```diff
@@ -0,0 +1,135 @@
+import argparse
+import logging
+from enum import Enum
+from os import getenv
+from pathlib import Path
+
+from dotenv import load_dotenv
+from promptflow.client import load_flow
+from promptflow.core import AzureOpenAIModelConfiguration
+from promptflow.evals.evaluate import evaluate
+from promptflow.evals.evaluators import RelevanceEvaluator
+
+BASE_DIR = Path(__file__).absolute().parent
+
+
+class EvaluatorType(Enum):
+    RELEVANCE = "relevance"
+    ANSWER_LENGTH = "answer_length"
+    APOLOGY = "apology"
+    DATASET = "dataset"
+
+
+def init_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="run_evaluators",
+        description="Evaluate with the prompt flow SDK",
+    )
+    parser.add_argument(
+        "-t",
+        "--type",
+        default=EvaluatorType.RELEVANCE.value,
+        choices=[t.value for t in EvaluatorType],
+        help="Evaluator type",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose mode",
+    )
+    return parser.parse_args()
+
+
+def run_relevance_evaluator(model_config):
+    relevance_eval = RelevanceEvaluator(model_config)
+
+    relevance_score = relevance_eval(
+        answer="The Alpine Explorer Tent is the most waterproof.",
+        context="From our product list,"
+        " the alpine explorer tent is the most waterproof."
+        " The Adventure Dining Table has higher weight.",
+        question="Which tent is the most waterproof?",
+    )
+
+    print(relevance_score)
+
+
+class AnswerLengthEvaluator:
+    def __init__(self):
+        pass
+
+    def __call__(self, *, answer: str, **kwargs):
+        return {"answer_length": len(answer)}
+
+
+def run_answer_length_evaluator():
+    evaluator = AnswerLengthEvaluator()
+    answer_length = evaluator(answer="What is the speed of light?")
+    print(answer_length)
+
+
+def get_apology_evaluator(model_config):
+    # Load the apology evaluator from its prompty file using promptflow
+    return load_flow(
+        source=f"{BASE_DIR}/apology.prompty",
+        model={"configuration": model_config},
+    )
+
+
+def run_apology_evaluator(model_config):
+    apology_eval = get_apology_evaluator(model_config)
+
+    apology_score = apology_eval(
+        question="Where can I get my car fixed?",
+        answer="I'm sorry, I don't know that. Would you like me to look it up for you? Sorry for the inconvenience.",
+    )
+    print(apology_score)
+
+
+def run_test_dataset(model_config):
+    result = evaluate(
+        data=f"{BASE_DIR}/data.jsonl",  # provide your data here
+        evaluators={
+            EvaluatorType.RELEVANCE.value: RelevanceEvaluator(model_config),
+            EvaluatorType.ANSWER_LENGTH.value: AnswerLengthEvaluator(),
+            EvaluatorType.APOLOGY.value: get_apology_evaluator(model_config),
+        },
+        # Column mapping
+        evaluator_config={
+            "default": {"ground_truth": "${data.ground_truth}"},
+        },
+        # Optionally provide an output path to dump a JSON of the metric summary, row-level data, and studio URL
+        output_path=f"{BASE_DIR}/results.json",
+    )
+    print(result)
+
+
+if __name__ == "__main__":
+    args = init_args()
+
+    # Set verbose mode
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+
+    load_dotenv()
+
+    model_config = AzureOpenAIModelConfiguration(
+        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
+        api_key=getenv("AZURE_OPENAI_API_KEY"),
+        azure_deployment=getenv("AZURE_OPENAI_GPT_MODEL"),
+        api_version=getenv("AZURE_OPENAI_API_VERSION"),
+    )
+
+    if args.type == EvaluatorType.RELEVANCE.value:
+        run_relevance_evaluator(model_config)
+    elif args.type == EvaluatorType.ANSWER_LENGTH.value:
+        run_answer_length_evaluator()
+    elif args.type == EvaluatorType.APOLOGY.value:
+        run_apology_evaluator(model_config)
+    elif args.type == EvaluatorType.DATASET.value:
+        run_test_dataset(model_config)
+    else:
+        print(f"Invalid evaluator type {args.type}")
+        print(f"Please choose from {', '.join([t.value for t in EvaluatorType])}")
+        exit(1)
```
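Since `AnswerLengthEvaluator` above is plain Python with no Azure dependency, its contract is easy to verify in isolation. A minimal sketch, with the class body copied from main.py; the asserted value is simply the character count of the answer string:

```python
class AnswerLengthEvaluator:
    """Code-based evaluator: any callable that returns a dict of metrics."""

    def __call__(self, *, answer: str, **kwargs):
        return {"answer_length": len(answer)}


evaluator = AnswerLengthEvaluator()
# "Paris is the capital of France." contains 31 characters.
assert evaluator(answer="Paris is the capital of France.") == {"answer_length": 31}
```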

poetry.lock

Lines changed: 96 additions & 1 deletion
Some generated files are not rendered by default.

pyproject.toml

Lines changed: 1 addition & 0 deletions

```diff
@@ -27,6 +27,7 @@ azure-storage-blob = "^12.22.0"
 requests = "^2.32.3"
 opencv-python-headless = "^4.10.0.84"
 promptflow = "^1.15.0"
+promptflow-evals = "^0.3.2"
 
 [tool.poetry.group.dev.dependencies]
 pre-commit = "^3.8.0"
```

requirements.txt

Lines changed: 5 additions & 4 deletions

```diff
@@ -4,18 +4,19 @@ streamlit==1.37.1
 azure-cosmos==4.7.0
 plotly==5.23.0
 pandas==2.2.2
-langchain==0.2.12
-langchain-openai==0.1.20
-langchain-community==0.2.11
+langchain==0.2.14
+langchain-openai==0.1.22
+langchain-community==0.2.12
 azure-search-documents==11.5.1
 azure-identity==1.17.1
 azure-ai-documentintelligence==1.0.0b3
 azure-storage-blob==12.22.0
 requests==2.32.3
 promptflow==1.15.0
+promptflow-evals==0.3.2
 
 # To run 99_streamlit_examples/pages/10_Object_Detection.py
-# ultralytics==8.2.77
+# ultralytics==8.2.82
 
 # To run 99_streamlit_examples/pages/11_Pose_Estimation.py
 # mediapipe==0.10.14
```
