2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -13,4 +13,4 @@ repos:
    rev: 24.2.0
    hooks:
      - id: black
        exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json'
        exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json|.prompty'
9 changes: 9 additions & 0 deletions apps/11_promptflow/README.md
@@ -273,6 +273,15 @@ $ pf run create \
$ pf run show-details --name $RUN_NAME
```

### evaluators

For guidance on working with evaluators, see [Evaluate with the prompt flow SDK](https://learn.microsoft.com/azure/ai-studio/how-to/develop/flow-evaluate-sdk).

```shell
# Show help
python apps/11_promptflow/evaluators/main.py --help
```
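
The script selects an evaluator via `-t`/`--type` (`relevance`, `answer_length`, `apology`, or `dataset`). A few invocation sketches, assuming the Azure OpenAI settings read by `main.py` (`AZURE_OPENAI_ENDPOINT`, `AZURE_OPENAI_API_KEY`, `AZURE_OPENAI_GPT_MODEL`, `AZURE_OPENAI_API_VERSION`) are provided in your `.env`:

```shell
# Run the built-in relevance evaluator against a single QA pair
python apps/11_promptflow/evaluators/main.py --type relevance

# Run the custom apology evaluator defined in apology.prompty
python apps/11_promptflow/evaluators/main.py --type apology

# Evaluate data.jsonl with all evaluators and write a summary to results.json
python apps/11_promptflow/evaluators/main.py --type dataset
```

The `dataset` run writes its output to `apps/11_promptflow/evaluators/results.json`, which is excluded from version control by the accompanying `.gitignore`.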

<!-- TODO: rag, deployments -->

## References
1 change: 1 addition & 0 deletions apps/11_promptflow/evaluators/.gitignore
@@ -0,0 +1 @@
results.json
25 changes: 25 additions & 0 deletions apps/11_promptflow/evaluators/apology.prompty
@@ -0,0 +1,25 @@
---
name: Apology Evaluator
description: Apology Evaluator for QA scenario
model:
  api: chat
  configuration:
    type: azure_openai
    connection: open_ai_connection
    azure_deployment: gpt-4
  parameters:
    temperature: 0.2
    response_format: { "type": "text" }
inputs:
  question:
    type: string
  answer:
    type: string
outputs:
  apology:
    type: int
---
system:
You are an AI tool that determines whether the assistant apologized (for example, by saying sorry) in a chat conversation.
Only provide a response of {"apology": 0} or {"apology": 1} so that the output is valid JSON.
Give an apology of 1 if the assistant apologized in the chat conversation.
2 changes: 2 additions & 0 deletions apps/11_promptflow/evaluators/data.jsonl
@@ -0,0 +1,2 @@
{"answer": "Paris is the capital of France.", "context": "France is in Europe", "ground_truth": "Paris has been the capital of France since the 10th century and is known for its cultural and historical landmarks.", "question": "What is the capital of France?"}
{"answer": "I'm sorry, I don't know that. Would you like me to look it up for you?", "context": "Fixing car isn't supported.", "ground_truth": "We don't support car fixing service. I'm sorry, I don't know that. Would you like me to look it up for you?", "question": "Where can I get my car fixed?"}
135 changes: 135 additions & 0 deletions apps/11_promptflow/evaluators/main.py
@@ -0,0 +1,135 @@
import argparse
import logging
from enum import Enum
from os import getenv
from pathlib import Path

from dotenv import load_dotenv
from promptflow.client import load_flow
from promptflow.core import AzureOpenAIModelConfiguration
from promptflow.evals.evaluate import evaluate
from promptflow.evals.evaluators import RelevanceEvaluator

BASE_DIR = Path(__file__).absolute().parent


class EvaluatorType(Enum):
    RELEVANCE = "relevance"
    ANSWER_LENGTH = "answer_length"
    APOLOGY = "apology"
    DATASET = "dataset"


def init_args() -> argparse.Namespace:
    parser = argparse.ArgumentParser(
        prog="run_evaluators",
        description="Evaluate with the prompt flow SDK",
    )
    parser.add_argument(
        "-t",
        "--type",
        default=EvaluatorType.RELEVANCE.value,
        choices=[t.value for t in EvaluatorType],
        help="Evaluator type",
    )
    parser.add_argument(
        "-v",
        "--verbose",
        action="store_true",
        help="Enable verbose mode",
    )
    return parser.parse_args()


def run_relevance_evaluator(model_config):
    relevance_eval = RelevanceEvaluator(model_config)

    relevance_score = relevance_eval(
        answer="The Alpine Explorer Tent is the most waterproof.",
        context="From our product list,"
        " the alpine explorer tent is the most waterproof."
        " The Adventure Dining Table has higher weight.",
        question="Which tent is the most waterproof?",
    )

    print(relevance_score)


class AnswerLengthEvaluator:
    def __init__(self):
        pass

    def __call__(self, *, answer: str, **kwargs):
        return {"answer_length": len(answer)}


def run_answer_length_evaluator():
    evaluator = AnswerLengthEvaluator()
    answer_length = evaluator(answer="What is the speed of light?")
    print(answer_length)


def get_apology_evaluator(model_config):
    return load_flow(
        source=f"{BASE_DIR}/apology.prompty",
        model={"configuration": model_config},
    )


def run_apology_evaluator(model_config):
    # load the apology evaluator from the prompty file via prompt flow
    apology_eval = get_apology_evaluator(model_config)

    apology_score = apology_eval(
        question="Where can I get my car fixed?",
        answer="I'm sorry, I don't know that. Would you like me to look it up for you? Sorry for the inconvenience.",
    )
    print(apology_score)


def run_test_dataset(model_config):
    result = evaluate(
        data=f"{BASE_DIR}/data.jsonl",  # provide your data here
        evaluators={
            EvaluatorType.RELEVANCE.value: RelevanceEvaluator(model_config),
            EvaluatorType.ANSWER_LENGTH.value: AnswerLengthEvaluator(),
            EvaluatorType.APOLOGY.value: get_apology_evaluator(model_config),
        },
        # column mapping: feed the ground_truth column from data.jsonl to each evaluator
        evaluator_config={
            "default": {"ground_truth": "${data.ground_truth}"},
        },
        # Optionally provide an output path to dump a JSON of the metric summary, row-level data, and studio URL
        output_path=f"{BASE_DIR}/results.json",
    )
    print(result)


if __name__ == "__main__":
    args = init_args()

    # Set verbose mode
    if args.verbose:
        logging.basicConfig(level=logging.DEBUG)

    load_dotenv()

    model_config = AzureOpenAIModelConfiguration(
        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
        api_key=getenv("AZURE_OPENAI_API_KEY"),
        azure_deployment=getenv("AZURE_OPENAI_GPT_MODEL"),
        api_version=getenv("AZURE_OPENAI_API_VERSION"),
    )

    if args.type == EvaluatorType.RELEVANCE.value:
        run_relevance_evaluator(model_config)
    elif args.type == EvaluatorType.ANSWER_LENGTH.value:
        run_answer_length_evaluator()
    elif args.type == EvaluatorType.APOLOGY.value:
        run_apology_evaluator(model_config)
    elif args.type == EvaluatorType.DATASET.value:
        run_test_dataset(model_config)
    else:
        print(f"Invalid evaluator type {args.type}")
        print(f"Please choose from {', '.join([t.value for t in EvaluatorType])}")
        exit(1)
97 changes: 96 additions & 1 deletion poetry.lock


1 change: 1 addition & 0 deletions pyproject.toml
@@ -27,6 +27,7 @@ azure-storage-blob = "^12.22.0"
requests = "^2.32.3"
opencv-python-headless = "^4.10.0.84"
promptflow = "^1.15.0"
promptflow-evals = "^0.3.2"

[tool.poetry.group.dev.dependencies]
pre-commit = "^3.8.0"
9 changes: 5 additions & 4 deletions requirements.txt
@@ -4,18 +4,19 @@ streamlit==1.37.1
azure-cosmos==4.7.0
plotly==5.23.0
pandas==2.2.2
langchain==0.2.12
langchain-openai==0.1.20
langchain-community==0.2.11
langchain==0.2.14
langchain-openai==0.1.22
langchain-community==0.2.12
azure-search-documents==11.5.1
azure-identity==1.17.1
azure-ai-documentintelligence==1.0.0b3
azure-storage-blob==12.22.0
requests==2.32.3
promptflow==1.15.0
promptflow-evals==0.3.2

# To run 99_streamlit_examples/pages/10_Object_Detection.py
# ultralytics==8.2.77
# ultralytics==8.2.82

# To run 99_streamlit_examples/pages/11_Pose_Estimation.py
# mediapipe==0.10.14