diff --git a/apps/11_promptflow/README.md b/apps/11_promptflow/README.md
index ebedecf..3300edb 100644
--- a/apps/11_promptflow/README.md
+++ b/apps/11_promptflow/README.md
@@ -34,6 +34,7 @@ $ pip install -r requirements.txt
 ## Examples
 
 [Prompt flow > Quick start](https://microsoft.github.io/promptflow/how-to-guides/quick-start.html) provides a quick start guide to Prompt flow.
+Some of the examples are extracted from [github.com/microsoft/promptflow/examples](https://github.com/microsoft/promptflow/tree/main/examples) to guide you through the basic usage of Prompt flow.
 
 ### [chat_minimal](https://github.com/microsoft/promptflow/tree/main/examples/flex-flows/chat-minimal)
 
@@ -203,7 +204,7 @@ $ pf run create \
 $ pf run show-details --name $RUN_NAME
 ```
 
-### chat-math-variant
+### [chat-math-variant](https://github.com/microsoft/promptflow/tree/main/examples/flows/chat/chat-math-variant)
 
 Tuning prompts using `variants` is a powerful feature in Prompt flow.
 It allows you to test different prompts and see which one works best for your use case.
@@ -229,7 +230,19 @@ $ pf run create \
 $ pf run show-details --name $RUN_NAME
 ```
 
+### [eval-chat-math](https://github.com/microsoft/promptflow/tree/main/examples/flows/evaluation/eval-chat-math)
+
+This example shows how to evaluate the answers to math questions by comparing the output results numerically with the standard answers.
+Details are available in [eval-chat-math/README.md](./eval-chat-math/README.md).
+To see how to operate the flow in VS Code, refer to the video [Build your high quality LLM apps with Prompt flow](https://www.youtube.com/watch?v=gcIe6nk2gA4).
+It shows how to evaluate the answers to math questions and guides you through tuning the prompts using variants.
+
+
+
 ## References
 
-- [Prompt flow > repos](https://github.com/microsoft/promptflow)
-- [Prompt flow > documents](https://microsoft.github.io/promptflow/)
+- [Repository](https://github.com/microsoft/promptflow)
+  - [examples](https://github.com/microsoft/promptflow/tree/main/examples)
+- [Documents](https://microsoft.github.io/promptflow/)
+  - [How-to Guides](https://microsoft.github.io/promptflow/how-to-guides/index.html)
+  - [Tutorials](https://microsoft.github.io/promptflow/tutorials/index.html)
diff --git a/apps/11_promptflow/eval-chat-math/README.md b/apps/11_promptflow/eval-chat-math/README.md
new file mode 100644
index 0000000..d956694
--- /dev/null
+++ b/apps/11_promptflow/eval-chat-math/README.md
@@ -0,0 +1,46 @@
+# Eval chat math
+
+This example shows how to evaluate the answers to math questions by comparing the output results numerically with the standard answers.
+
+Learn more in the corresponding [tutorial](../../../tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md).
+
+Tools used in this flow:
+- `python` tool
+
+## Prerequisites
+
+Install the promptflow SDK and the other dependencies in this folder:
+```bash
+pip install -r requirements.txt
+```
+
+### 1. Test the flow with single-line data
+
+Test the flow or a single node:
+```bash
+# test with the default input values in flow.dag.yaml
+pf flow test --flow .
+
+# test with flow inputs
+pf flow test --flow . --inputs groundtruth=123 prediction=123
+
+# test a node with inputs
+pf flow test --flow . --node line_process --inputs groundtruth=123 prediction=123
+```
+
+### 2. Create a flow run with multi-line data
+There are two ways to run an evaluation flow over multiple lines of data; the first is to run it directly against a data file:
+
+```bash
+pf run create --flow . --data ./data.jsonl --stream
+```
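+The second is to evaluate the outputs of an existing batch run by referencing that run and mapping its columns into this flow's inputs with `--column-mapping`.
+A sketch, assuming a base chat run named `$RUN_NAME` whose flow outputs an `answer` field; adjust the mapping to match your own run:
+
+```bash
+pf run create --flow . \
+    --data ./data.jsonl \
+    --run $RUN_NAME \
+    --column-mapping groundtruth='${data.groundtruth}' prediction='${run.outputs.answer}' \
+    --stream
+```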
diff --git a/apps/11_promptflow/eval-chat-math/aggregate.py b/apps/11_promptflow/eval-chat-math/aggregate.py
new file mode 100644
index 0000000..62291fc
--- /dev/null
+++ b/apps/11_promptflow/eval-chat-math/aggregate.py
@@ -0,0 +1,35 @@
+from promptflow.core import log_metric, tool
+
+
+@tool
+def accuracy_aggregate(processed_results: list[int]):
+    # Count the per-line results: 1 means correct, -1 means exception/incorrect.
+    num_exception = 0
+    num_correct = 0
+
+    for result in processed_results:
+        if result == -1:
+            num_exception += 1
+        elif result == 1:
+            num_correct += 1
+
+    num_total = len(processed_results)
+    accuracy = round(1.0 * num_correct / num_total, 2)
+    error_rate = round(1.0 * num_exception / num_total, 2)
+
+    log_metric(key="accuracy", value=accuracy)
+    log_metric(key="error_rate", value=error_rate)
+
+    return {
+        "num_total": num_total,
+        "num_correct": num_correct,
+        "num_exception": num_exception,
+        "accuracy": accuracy,
+        "error_rate": error_rate,
+    }
+
+
+if __name__ == "__main__":
+    numbers = [1, 1, 1, 1, 0, -1, -1]
+    accuracy = accuracy_aggregate(numbers)
+    print("The accuracy is", accuracy)
diff --git a/apps/11_promptflow/eval-chat-math/data.jsonl b/apps/11_promptflow/eval-chat-math/data.jsonl
new file mode 100644
index 0000000..d1e184d
--- /dev/null
+++ b/apps/11_promptflow/eval-chat-math/data.jsonl
@@ -0,0 +1,3 @@
+{"groundtruth": "10","prediction": "10"}
+{"groundtruth": "253","prediction": "506"}
+{"groundtruth": "1/3","prediction": "2/6"}
\ No newline at end of file
diff --git a/apps/11_promptflow/eval-chat-math/flow.dag.yaml b/apps/11_promptflow/eval-chat-math/flow.dag.yaml
new file mode 100644
index 0000000..10b108b
--- /dev/null
+++ b/apps/11_promptflow/eval-chat-math/flow.dag.yaml
@@ -0,0 +1,38 @@
+$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
+environment:
+  python_requirements_txt: requirements.txt
+inputs:
+  groundtruth:
+    type: string
+    default: "10"
+    is_chat_input: false
+  prediction:
+    type: string
+    default: "10"
+    is_chat_input: false
+outputs:
+  score:
+    type: string
+    reference: ${line_process.output}
+nodes:
+- name: line_process
+  type: python
+  source:
+    type: code
+    path: line_process.py
+  inputs:
+    groundtruth: ${inputs.groundtruth}
+    prediction: ${inputs.prediction}
+  use_variants: false
+- name: aggregate
+  type: python
+  source:
+    type: code
+    path: aggregate.py
+  inputs:
+    processed_results: ${line_process.output}
+  aggregation: true
+  use_variants: false
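+# Because "aggregation: true" is set, the aggregate node runs once per batch
+# run and receives the line_process outputs of all input lines as a single
+# list; a regular python node such as line_process runs once per input line.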
diff --git a/apps/11_promptflow/eval-chat-math/line_process.py b/apps/11_promptflow/eval-chat-math/line_process.py
new file mode 100644
index 0000000..f6ffb19
--- /dev/null
+++ b/apps/11_promptflow/eval-chat-math/line_process.py
@@ -0,0 +1,57 @@
+from typing import Optional
+
+from promptflow.core import tool
+
+
+def string_to_number(raw_string: str) -> Optional[float]:
+    """Try to parse the prediction string or the groundtruth string into a float.
+    Supports ints, floats, and fractions; returns None for non-numeric strings in a wrong
+    format, e.g. 'the answer is \\box{2/3}', '0, 5, or any number greater than 11', '4/7//9'.
+    """
+    float_number = 0.0
+    try:
+        float_number = float(raw_string)
+    except Exception:
+        if "/" in raw_string:
+            split_list = raw_string.split("/")
+            if len(split_list) == 2:
+                numerator, denominator = split_list
+                try:
+                    float_number = float(numerator) / float(denominator)
+                except Exception:
+                    return None
+            else:
+                return None
+        else:
+            return None
+    return float_number
+
+
+@tool
+def line_process(groundtruth: str, prediction: str) -> int:
+    pred_float = string_to_number(prediction)
+    # Early stop: the prediction could not be parsed into a number.
+    if pred_float is None:
+        return -1
+    gt_float = string_to_number(groundtruth)
+    if gt_float is None:
+        return -1
+    # Both pred_float and gt_float are valid; compare them up to 10 decimal places.
+    if round(pred_float, 10) == round(gt_float, 10):
+        return 1
+    else:
+        return -1
+
+
+if __name__ == "__main__":
+    processed_result = line_process("3/5", "6/10")
+    print("The processed result is", processed_result)
+
+    processed_result = line_process("1/2", "0.5")
+    print("The processed result is", processed_result)
+
+    processed_result = line_process("3", "5")
+    print("The processed result is", processed_result)
+
+    processed_result = line_process("2/3", "the answer is \\box{2/3}")
+    print("The processed result is", processed_result)
diff --git a/apps/11_promptflow/eval-chat-math/requirements.txt b/apps/11_promptflow/eval-chat-math/requirements.txt
new file mode 100644
index 0000000..ea9e957
--- /dev/null
+++ b/apps/11_promptflow/eval-chat-math/requirements.txt
@@ -0,0 +1,2 @@
+promptflow
+promptflow-tools