Commit df00d51

Merge pull request #106 from ks6088ts-labs/feature/issue-69_promptflow-demo
fork eval-chat-math example
2 parents d16ca7a + 3707542

File tree

- apps/11_promptflow/README.md
- apps/11_promptflow/eval-chat-math/README.md
- apps/11_promptflow/eval-chat-math/aggregate.py
- apps/11_promptflow/eval-chat-math/data.jsonl
- apps/11_promptflow/eval-chat-math/flow.dag.yaml
- apps/11_promptflow/eval-chat-math/line_process.py
- apps/11_promptflow/eval-chat-math/requirements.txt

7 files changed: +182 -3 lines changed

apps/11_promptflow/README.md

Lines changed: 16 additions & 3 deletions
@@ -34,6 +34,7 @@ $ pip install -r requirements.txt
 ## Examples
 
 [Prompt flow > Quick start](https://microsoft.github.io/promptflow/how-to-guides/quick-start.html) provides a quick start guide to Prompt flow.
+Some of the examples are extracted from [github.com/microsoft/promptflow/examples](https://github.com/microsoft/promptflow/tree/main/examples) to guide you through the basic usage of Prompt flow.
 
 ### [chat_minimal](https://github.com/microsoft/promptflow/tree/main/examples/flex-flows/chat-minimal)
 
@@ -203,7 +204,7 @@ $ pf run create \
 $ pf run show-details --name $RUN_NAME
 ```
 
-### chat-math-variant
+### [chat-math-variant](https://github.com/microsoft/promptflow/tree/main/examples/flows/chat/chat-math-variant)
 
 Tuning prompts using `variants` is a powerful feature in Prompt flow. It allows you to test different prompts and see which one works best for your use case.
 
@@ -229,7 +230,19 @@ $ pf run create \
 $ pf run show-details --name $RUN_NAME
 ```
 
+### [eval-chat-math](https://github.com/microsoft/promptflow/tree/main/examples/flows/evaluation/eval-chat-math)
+
+This example shows how to evaluate the answers to math questions by comparing the outputs with the reference answers numerically.
+Details are available in [eval-chat-math/README.md](./eval-chat-math/README.md).
+To see how to operate the flow in VS Code, refer to the video [Build your high quality LLM apps with Prompt flow](https://www.youtube.com/watch?v=gcIe6nk2gA4).
+The video shows how to evaluate the answers to math questions and guides you through tuning the prompts using variants.
+
+<!-- TODO: rag, tracing, deployments -->
+
 ## References
 
-- [Prompt flow > repos](https://github.com/microsoft/promptflow)
-- [Prompt flow > documents](https://microsoft.github.io/promptflow/)
+- [Repository](https://github.com/microsoft/promptflow)
+- [examples](https://github.com/microsoft/promptflow/tree/main/examples)
+- [Documents](https://microsoft.github.io/promptflow/)
+- [How-to Guides](https://microsoft.github.io/promptflow/how-to-guides/index.html)
+- [Tutorials](https://microsoft.github.io/promptflow/tutorials/index.html#)
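
A quick way to try the forked example locally, sketched with hedged assumptions: the flow is expected to live at `apps/11_promptflow/eval-chat-math`, as the README links above suggest, and the commands mirror the ones already used in this README.

```bash
# Smoke-test the evaluation flow with the default inputs from flow.dag.yaml.
# The path is an assumption based on the README's relative links.
cd apps/11_promptflow/eval-chat-math
pip install -r requirements.txt
pf flow test --flow .
```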
apps/11_promptflow/eval-chat-math/README.md

Lines changed: 36 additions & 0 deletions

# Eval chat math

This example shows how to evaluate the answers to math questions by comparing the outputs with the reference answers numerically.

Learn more in the corresponding [tutorial](../../../tutorials/flow-fine-tuning-evaluation/promptflow-quality-improvement.md).

Tools used in this flow:
- `python` tool

## Prerequisites

Install the promptflow SDK and the other dependencies in this folder:

```bash
pip install -r requirements.txt
```

### 1. Test the flow with single-line data

Test the flow or a single node:

```bash
# test with the default input values in flow.dag.yaml
pf flow test --flow .

# test with flow inputs
pf flow test --flow . --inputs groundtruth=123 prediction=123

# test a node with inputs
pf flow test --flow . --node line_process --inputs groundtruth=123 prediction=123
```

### 2. Create a flow run with multi-line data

There are two ways to evaluate a flow like this. The first is to create a run directly from a data file (a sketch of the second follows this listing):

```bash
pf run create --flow . --data ./data.jsonl --stream
```
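
The README above mentions two ways to evaluate but only shows the direct data-file run. The second way is presumably to evaluate the outputs of an existing batch run by mapping its columns onto this flow's inputs. The sketch below is illustrative only: `<chat_run_name>`, `<chat_test_data>.jsonl` and the `answer` column are placeholders, not taken from this commit.

```bash
# Hypothetical: score the outputs of an earlier chat batch run against its reference answers.
# All angle-bracket names and the `answer` columns are placeholders.
pf run create --flow . \
  --data <chat_test_data>.jsonl \
  --run <chat_run_name> \
  --column-mapping groundtruth='${data.answer}' prediction='${run.outputs.answer}' \
  --stream
```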
apps/11_promptflow/eval-chat-math/aggregate.py

Lines changed: 35 additions & 0 deletions

from promptflow.core import log_metric, tool


@tool
def accuracy_aggregate(processed_results: list[int]):
    # Count lines scored -1 (unparsable or wrong) and lines scored 1 (correct).
    num_exception = 0
    num_correct = 0

    for i in range(len(processed_results)):
        if processed_results[i] == -1:
            num_exception += 1
        elif processed_results[i] == 1:
            num_correct += 1

    num_total = len(processed_results)
    accuracy = round(1.0 * num_correct / num_total, 2)
    error_rate = round(1.0 * num_exception / num_total, 2)

    # Surface the aggregated values as run metrics.
    log_metric(key="accuracy", value=accuracy)
    log_metric(key="error_rate", value=error_rate)

    return {
        "num_total": num_total,
        "num_correct": num_correct,
        "num_exception": num_exception,
        "accuracy": accuracy,
        "error_rate": error_rate,
    }


if __name__ == "__main__":
    numbers = [1, 1, 1, 1, 0, -1, -1]
    accuracy = accuracy_aggregate(numbers)
    print("The accuracy is", accuracy)
apps/11_promptflow/eval-chat-math/data.jsonl

Lines changed: 3 additions & 0 deletions

{"groundtruth": "10","prediction": "10"}
{"groundtruth": "253","prediction": "506"}
{"groundtruth": "1/3","prediction": "2/6"}
apps/11_promptflow/eval-chat-math/flow.dag.yaml

Lines changed: 35 additions & 0 deletions

$schema: https://azuremlschemas.azureedge.net/promptflow/latest/Flow.schema.json
environment:
  python_requirements_txt: requirements.txt
inputs:
  groundtruth:
    type: string
    default: "10"
    is_chat_input: false
  prediction:
    type: string
    default: "10"
    is_chat_input: false
outputs:
  score:
    type: string
    reference: ${line_process.output}
nodes:
- name: line_process
  type: python
  source:
    type: code
    path: line_process.py
  inputs:
    groundtruth: ${inputs.groundtruth}
    prediction: ${inputs.prediction}
  use_variants: false
- name: aggregate
  type: python
  source:
    type: code
    path: aggregate.py
  inputs:
    processed_results: ${line_process.output}
  aggregation: true
  use_variants: false
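
The DAG wires `line_process` into both the `score` output and the `aggregate` node, which runs once over all lines because of `aggregation: true`. A hedged way to exercise the fraction-parsing path and observe the `score` output, reusing the test command already shown in the example's README:

```bash
# "1/3" and "2/6" parse to the same float, so line_process (and therefore score) should return 1.
pf flow test --flow . --inputs groundtruth="1/3" prediction="2/6"
```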
apps/11_promptflow/eval-chat-math/line_process.py

Lines changed: 55 additions & 0 deletions

from promptflow.core import tool


def string_to_number(raw_string: str) -> float:
    """Try to parse the prediction string and the groundtruth string into a float.

    Supports int, float and fraction inputs, and returns None for non-numeric strings
    with the wrong format, e.g. 'the answer is \box{2/3}', '0, 5, or any number greater than 11', '4/7//9'.
    """
    float_number = 0.0
    try:
        float_number = float(raw_string)
    except Exception:
        if "/" in raw_string:
            split_list = raw_string.split("/")
            if len(split_list) == 2:
                numerator, denominator = split_list
                try:
                    float_number = float(numerator) / float(denominator)
                except Exception:
                    return None
            else:
                return None
        else:
            return None
    return float_number


@tool
def line_process(groundtruth: str, prediction: str) -> int:
    pred_float = string_to_number(prediction)
    # Early stop: the prediction could not be parsed into a number.
    if pred_float is None:
        return -1
    gt_float = string_to_number(groundtruth)
    if gt_float is None:
        return -1
    # Both pred_float and gt_float are valid numbers; compare them up to 10 decimal places.
    if round(pred_float, 10) == round(gt_float, 10):
        return 1
    else:
        return -1


if __name__ == "__main__":
    processed_result = line_process("3/5", "6/10")
    print("The processed result is", processed_result)

    processed_result = line_process("1/2", "0.5")
    print("The processed result is", processed_result)

    processed_result = line_process("3", "5")
    print("The processed result is", processed_result)

    processed_result = line_process("2/3", "the answer is \box{2/3}")
    print("The processed result is", processed_result)
apps/11_promptflow/eval-chat-math/requirements.txt

Lines changed: 2 additions & 0 deletions

promptflow
promptflow-tools
