From 49e668354979f9e8802781256fe2b6c4937e2d3d Mon Sep 17 00:00:00 2001
From: ks6088ts
Date: Sun, 1 Sep 2024 09:27:11 +0900
Subject: [PATCH] hands on evaluators with prompt flow sdk

---
 .pre-commit-config.yaml                       |   2 +-
 apps/11_promptflow/README.md                  |   9 ++
 apps/11_promptflow/evaluators/.gitignore      |   1 +
 apps/11_promptflow/evaluators/apology.prompty |  25 ++++
 apps/11_promptflow/evaluators/data.jsonl      |   2 +
 apps/11_promptflow/evaluators/main.py         | 135 ++++++++++++++++++
 poetry.lock                                   |  97 ++++++++++++-
 pyproject.toml                                |   1 +
 requirements.txt                              |   9 +-
 9 files changed, 275 insertions(+), 6 deletions(-)
 create mode 100644 apps/11_promptflow/evaluators/.gitignore
 create mode 100644 apps/11_promptflow/evaluators/apology.prompty
 create mode 100644 apps/11_promptflow/evaluators/data.jsonl
 create mode 100644 apps/11_promptflow/evaluators/main.py

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 6078d7b..df21ac2 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -13,4 +13,4 @@ repos:
     rev: 24.2.0
     hooks:
       - id: black
-exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json'
+exclude: 'generated/.*|artifacts/.*|.jsonl|.csv|.json|.prompty'
diff --git a/apps/11_promptflow/README.md b/apps/11_promptflow/README.md
index 48cf098..8fe60ff 100644
--- a/apps/11_promptflow/README.md
+++ b/apps/11_promptflow/README.md
@@ -273,6 +273,15 @@ $ pf run create \
 $ pf run show-details --name $RUN_NAME
 ```
 
+### evaluators
+
+For a guided walkthrough of working with evaluators, see [Evaluate with the prompt flow SDK](https://learn.microsoft.com/azure/ai-studio/how-to/develop/flow-evaluate-sdk).
+
+```shell
+# Show help
+python apps/11_promptflow/evaluators/main.py --help
+```
+
 ## References
diff --git a/apps/11_promptflow/evaluators/.gitignore b/apps/11_promptflow/evaluators/.gitignore
new file mode 100644
index 0000000..91a9c59
--- /dev/null
+++ b/apps/11_promptflow/evaluators/.gitignore
@@ -0,0 +1 @@
+results.json
diff --git a/apps/11_promptflow/evaluators/apology.prompty b/apps/11_promptflow/evaluators/apology.prompty
new file mode 100644
index 0000000..4fda999
--- /dev/null
+++ b/apps/11_promptflow/evaluators/apology.prompty
@@ -0,0 +1,25 @@
+---
+name: Apology Evaluator
+description: Apology Evaluator for QA scenario
+model:
+  api: chat
+  configuration:
+    type: azure_openai
+    connection: open_ai_connection
+    azure_deployment: gpt-4
+  parameters:
+    temperature: 0.2
+    response_format: { "type": "text" }
+inputs:
+  question:
+    type: string
+  answer:
+    type: string
+outputs:
+  apology:
+    type: int
+---
+system:
+You are an AI tool that determines whether the assistant apologized in a chat conversation, for example by saying sorry.
+Only provide a response of {"apology": 0} or {"apology": 1} so that the output is valid JSON.
+Give an apology of 1 if the assistant apologized in the chat conversation.
\ No newline at end of file
diff --git a/apps/11_promptflow/evaluators/data.jsonl b/apps/11_promptflow/evaluators/data.jsonl
new file mode 100644
index 0000000..964838a
--- /dev/null
+++ b/apps/11_promptflow/evaluators/data.jsonl
@@ -0,0 +1,2 @@
+{"answer": "Paris is the capital of France.", "context": "France is in Europe", "ground_truth": "Paris has been the capital of France since the 10th century and is known for its cultural and historical landmarks.", "question": "What is the capital of France?"}
+{"answer": "I'm sorry, I don't know that. Would you like me to look it up for you?", "context": "Fixing car isn't supported.", "ground_truth": "We don't support car fixing service. I'm sorry, I don't know that. Would you like me to look it up for you?", "question": "Where can I get my car fixed?"}
\ No newline at end of file
diff --git a/apps/11_promptflow/evaluators/main.py b/apps/11_promptflow/evaluators/main.py
new file mode 100644
index 0000000..2c4a5fa
--- /dev/null
+++ b/apps/11_promptflow/evaluators/main.py
@@ -0,0 +1,135 @@
+import argparse
+import logging
+from enum import Enum
+from os import getenv
+from pathlib import Path
+
+from dotenv import load_dotenv
+from promptflow.client import load_flow
+from promptflow.core import AzureOpenAIModelConfiguration
+from promptflow.evals.evaluate import evaluate
+from promptflow.evals.evaluators import RelevanceEvaluator
+
+BASE_DIR = Path(__file__).absolute().parent
+
+
+class EvaluatorType(Enum):
+    RELEVANCE = "relevance"
+    ANSWER_LENGTH = "answer_length"
+    APOLOGY = "apology"
+    DATASET = "dataset"
+
+
+def init_args() -> argparse.Namespace:
+    parser = argparse.ArgumentParser(
+        prog="run_evaluators",
+        description="Evaluate with the prompt flow SDK",
+    )
+    parser.add_argument(
+        "-t",
+        "--type",
+        default=EvaluatorType.RELEVANCE.value,
+        choices=[t.value for t in EvaluatorType],
+        help="Evaluator type",
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        help="Enable verbose mode",
+    )
+    return parser.parse_args()
+
+
+def run_relevance_evaluator(model_config):
+    relevance_eval = RelevanceEvaluator(model_config)
+
+    relevance_score = relevance_eval(
+        answer="The Alpine Explorer Tent is the most waterproof.",
+        context="From our product list,"
+        " the alpine explorer tent is the most waterproof."
+        " The Adventure Dining Table has higher weight.",
+        question="Which tent is the most waterproof?",
+    )
+
+    print(relevance_score)
+
+
+class AnswerLengthEvaluator:
+    def __init__(self):
+        pass
+
+    def __call__(self, *, answer: str, **kwargs):
+        return {"answer_length": len(answer)}
+
+
+def run_answer_length_evaluator():
+    evaluator = AnswerLengthEvaluator()
+    answer_length = evaluator(answer="What is the speed of light?")
+    print(answer_length)
+
+
+def get_apology_evaluator(model_config):
+    return load_flow(
+        source=f"{BASE_DIR}/apology.prompty",
+        model={"configuration": model_config},
+    )
+
+
+def run_apology_evaluator(model_config):
+    # load the apology evaluator from the prompty file using promptflow
+    apology_eval = get_apology_evaluator(model_config)
+
+    apology_score = apology_eval(
+        question="Where can I get my car fixed?",
+        answer="I'm sorry, I don't know that. Would you like me to look it up for you? Sorry for the inconvenience.",
+    )
+    print(apology_score)
+
+
+def run_test_dataset(model_config):
+    result = evaluate(
+        data=f"{BASE_DIR}/data.jsonl",  # provide your data here
+        evaluators={
+            EvaluatorType.RELEVANCE.value: RelevanceEvaluator(model_config),
+            EvaluatorType.ANSWER_LENGTH.value: AnswerLengthEvaluator(),
+            EvaluatorType.APOLOGY.value: get_apology_evaluator(model_config),
+        },
+        # column mapping
+        evaluator_config={
+            "default": {"ground_truth": "${data.ground_truth}"},
+        },
+        # Optionally provide an output path to dump a JSON with the metric summary, row-level data, and the studio URL
+        output_path=f"{BASE_DIR}/results.json",
+    )
+    print(result)
+
+
+if __name__ == "__main__":
+    args = init_args()
+
+    # Set verbose mode
+    if args.verbose:
+        logging.basicConfig(level=logging.DEBUG)
+
+    load_dotenv()
+
+    model_config = AzureOpenAIModelConfiguration(
+        azure_endpoint=getenv("AZURE_OPENAI_ENDPOINT"),
+        api_key=getenv("AZURE_OPENAI_API_KEY"),
+        azure_deployment=getenv("AZURE_OPENAI_GPT_MODEL"),
+        api_version=getenv("AZURE_OPENAI_API_VERSION"),
+    )
+
+    if args.type == EvaluatorType.RELEVANCE.value:
+        run_relevance_evaluator(model_config)
+    elif args.type == EvaluatorType.ANSWER_LENGTH.value:
+        run_answer_length_evaluator()
+    elif args.type == EvaluatorType.APOLOGY.value:
+        run_apology_evaluator(model_config)
+    elif args.type == EvaluatorType.DATASET.value:
+        run_test_dataset(model_config)
+    else:
+        print(f"Invalid evaluator type {args.type}")
+        print(f"Please choose from {', '.join([t.value for t in EvaluatorType])}")
+        exit(1)
diff --git a/poetry.lock b/poetry.lock
index 73cc93a..f931365 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -119,6 +119,20 @@ yarl = ">=1.0,<2.0"
 [package.extras]
 speedups = ["Brotli", "aiodns (>=3.2.0)", "brotlicffi"]
 
+[[package]]
+name = "aiohttp-retry"
+version = "2.8.3"
+description = "Simple retry client for aiohttp"
+optional = false
+python-versions = ">=3.7"
+files = [
+    {file = "aiohttp_retry-2.8.3-py3-none-any.whl", hash = "sha256:3aeeead8f6afe48272db93ced9440cf4eda8b6fd7ee2abb25357b7eb28525b45"},
+    {file = "aiohttp_retry-2.8.3.tar.gz", hash = "sha256:9a8e637e31682ad36e1ff9f8bcba912fcfc7d7041722bc901a4b948da4d71ea9"},
+]
+
+[package.dependencies]
+aiohttp = "*"
+
 [[package]]
 name = "aiosignal"
 version = "1.3.1"
@@ -1539,6 +1553,20 @@ files = [
 [package.dependencies]
 six = "*"
 
+[[package]]
+name = "isort"
+version = "5.13.2"
+description = "A Python utility / library to sort Python imports."
+optional = false
+python-versions = ">=3.8.0"
+files = [
+    {file = "isort-5.13.2-py3-none-any.whl", hash = "sha256:8ca5e72a8d85860d5a3fa69b8745237f2939afe12dbf656afbcb47fe72d947a6"},
+    {file = "isort-5.13.2.tar.gz", hash = "sha256:48fdfcb9face5d58a4f6dde2e72a1fb8dcaf8ab26f95ab49fab84c2ddefb0109"},
+]
+
+[package.extras]
+colors = ["colorama (>=0.4.6)"]
+
 [[package]]
 name = "itsdangerous"
 version = "2.2.0"
@@ -1749,6 +1777,20 @@ files = [
 [package.dependencies]
 jsonpointer = ">=1.9"
 
+[[package]]
+name = "jsonpath-ng"
+version = "1.6.1"
+description = "A final implementation of JSONPath for Python that aims to be standard compliant, including arithmetic and binary comparison operators and providing clear AST for metaprogramming."
+optional = false +python-versions = "*" +files = [ + {file = "jsonpath-ng-1.6.1.tar.gz", hash = "sha256:086c37ba4917304850bd837aeab806670224d3f038fe2833ff593a672ef0a5fa"}, + {file = "jsonpath_ng-1.6.1-py3-none-any.whl", hash = "sha256:8f22cd8273d7772eea9aaa84d922e0841aa36fdb8a2c6b7f6c3791a16a9bc0be"}, +] + +[package.dependencies] +ply = "*" + [[package]] name = "jsonpointer" version = "3.0.0" @@ -3254,6 +3296,17 @@ files = [ dev = ["pre-commit", "tox"] testing = ["pytest", "pytest-benchmark"] +[[package]] +name = "ply" +version = "3.11" +description = "Python Lex & Yacc" +optional = false +python-versions = "*" +files = [ + {file = "ply-3.11-py2.py3-none-any.whl", hash = "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"}, + {file = "ply-3.11.tar.gz", hash = "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3"}, +] + [[package]] name = "portalocker" version = "2.10.1" @@ -3375,6 +3428,32 @@ waitress = ">=2.1.2,<3.0.0" executable = ["bs4", "pyarrow (>=14.0.1,<15.0.0)", "pyinstaller (>=5.13.2)", "streamlit (>=1.26.0)", "streamlit-quill (<0.1.0)"] pyarrow = ["pyarrow (>=14.0.1,<15.0.0)"] +[[package]] +name = "promptflow-evals" +version = "0.3.2" +description = "Prompt flow evals" +optional = false +python-versions = "<4.0,>=3.8" +files = [ + {file = "promptflow_evals-0.3.2-py3-none-any.whl", hash = "sha256:4a07f85db9b3564b654e5c380360c699fbc470acd2e15046c1b2f78df1730cb6"}, +] + +[package.dependencies] +aiohttp_retry = ">=2.8.3" +azure-core = ">=1.30.2" +azure-identity = ">=1.17.1" +isort = ">=5.13.2,<6.0.0" +jsonpath_ng = ">=1.5.0" +numpy = ">=1.22" +promptflow-core = ">=1.14.0,<2.0.0" +promptflow-devkit = ">=1.14.0,<2.0.0" +pyjwt = ">=2.8.0" +urllib3 = ">1.26.17" +websocket-client = ">=1.2.0" + +[package.extras] +azure = ["promptflow-azure (>=1.14.0,<2.0.0)"] + [[package]] name = "promptflow-tracing" version = "1.15.0" @@ -5044,6 +5123,22 @@ files = [ [package.extras] watchmedo = ["PyYAML (>=3.10)"] +[[package]] +name = "websocket-client" +version = "1.8.0" +description = "WebSocket client for Python with low level API options" +optional = false +python-versions = ">=3.8" +files = [ + {file = "websocket_client-1.8.0-py3-none-any.whl", hash = "sha256:17b44cc997f5c498e809b22cdf2d9c7a9e71c02c8cc2b6c56e7c2d1239bfa526"}, + {file = "websocket_client-1.8.0.tar.gz", hash = "sha256:3239df9f44da632f96012472805d40a23281a991027ce11d2f45a6f24ac4c3da"}, +] + +[package.extras] +docs = ["Sphinx (>=6.0)", "myst-parser (>=2.0.0)", "sphinx-rtd-theme (>=1.1.0)"] +optional = ["python-socks", "wsaccel"] +test = ["websockets"] + [[package]] name = "werkzeug" version = "3.0.4" @@ -5265,4 +5360,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "dfb3022b3b0c10d975b964eed5083f9471673f10a7255fc4a5d1cd17265ea8ab" +content-hash = "7845e52dadd7997beb2f689111d540313becacfb7c1a392575493de26f08efb2" diff --git a/pyproject.toml b/pyproject.toml index c804acf..10d02ee 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,6 +27,7 @@ azure-storage-blob = "^12.22.0" requests = "^2.32.3" opencv-python-headless = "^4.10.0.84" promptflow = "^1.15.0" +promptflow-evals = "^0.3.2" [tool.poetry.group.dev.dependencies] pre-commit = "^3.8.0" diff --git a/requirements.txt b/requirements.txt index 24c4e4a..3e02fd6 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,18 +4,19 @@ streamlit==1.37.1 azure-cosmos==4.7.0 plotly==5.23.0 pandas==2.2.2 -langchain==0.2.12 -langchain-openai==0.1.20 -langchain-community==0.2.11 +langchain==0.2.14 
+langchain-openai==0.1.22 +langchain-community==0.2.12 azure-search-documents==11.5.1 azure-identity==1.17.1 azure-ai-documentintelligence==1.0.0b3 azure-storage-blob==12.22.0 requests==2.32.3 promptflow==1.15.0 +promptflow-evals==0.3.2 # To run 99_streamlit_examples/pages/10_Object_Detection.py -# ultralytics==8.2.77 +# ultralytics==8.2.82 # To run 99_streamlit_examples/pages/11_Pose_Estimation.py # mediapipe==0.10.14
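
Usage note: the evaluator types wired up in `apps/11_promptflow/evaluators/main.py` map directly onto its `--type` flag, so each scenario in this patch can be exercised as shown below. This is a minimal sketch, assuming the `AZURE_OPENAI_*` environment variables read in `main.py` are set (for example via a `.env` file picked up by `load_dotenv()`); only the `answer_length` evaluator runs without calling Azure OpenAI.

```shell
# Built-in relevance evaluator on a single hard-coded QA pair
python apps/11_promptflow/evaluators/main.py --type relevance

# Custom code-based evaluator (pure Python, no model call)
python apps/11_promptflow/evaluators/main.py --type answer_length

# Prompty-based apology evaluator
python apps/11_promptflow/evaluators/main.py --type apology

# Run all three evaluators against data.jsonl
python apps/11_promptflow/evaluators/main.py --type dataset
```

The `dataset` run writes the metric summary and row-level results to `apps/11_promptflow/evaluators/results.json`, which the new `.gitignore` entry keeps out of version control.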