DEV: Allow prompts to have multiple tests per config and followups with tools (#9)

nattsw · web-flow · commit 9d307bcc57cb · 2025-03-18T11:42:26.000+08:00
This commit builds on #8. - An eval may now have multiple tests, denoted by `args`: when `args` is an array, each entry is a separate test (i.e. read `args` as `tests`). See `tool_call_chains.yml`. - An eval may have one or many `followups`. See `tool_calls_with_no_tool.yml` (one) or `tool_call_chains.yml` (many). - Followups may use tools or the typical prompt message we support.
diff --git a/README.md b/README.md
@@ -3,3 +3,14 @@
 This repo is not a plugin, and is meant to be used in conjunction with [Discourse AI](https://github.com/discourse/discourse-ai) plugin.
 
 See https://github.com/discourse/discourse-ai?tab=readme-ov-file#evals for more information.
+
+#### Prompts
+
+Each eval config may contain either a single test case or multiple test cases. Attribute names (`prompt`/`prompts`, `message`/`messages`, `followup`/`followups`) are singular or plural accordingly.
+
+Single test case example:
+- https://github.com/discourse/discourse-ai-evals/blob/main/tool_calls/tool_calls_with_no_tool.yml
+
+Multiple test case examples:
+- https://github.com/discourse/discourse-ai-evals/blob/main/translate/translate_topic_title.yml (with judge)
+- https://github.com/discourse/discourse-ai-evals/blob/main/tool_calls/tool_call_chains.yml (with multiple followups)
diff --git a/tool_calls/tool_call_chains.yml b/tool_calls/tool_call_chains.yml
@@ -0,0 +1,37 @@
+id: tool_call_chains
+name: Tool call chains
+description: Call multiple tools in multiple tests
+type: prompt
+args:
+  - id: addition-test
+    name: Addition
+    description: Test the addition works in subsequent tool calls
+    temperature: 0
+    stream: false
+    prompts:
+      - "You are a helpful bot"
+    messages:
+      - "Add 1 and 2"
+    tools:
+      -
+        name: "addition"
+        description: "Will add two numbers"
+        parameters:
+          - name: "text"
+            type: "string"
+            description: "the numbers to add"
+            required: true
+    followups:
+      -
+        tools: []
+        message:
+          type: "tool"
+          id: ["tool_call", "id"]
+          name: ["tool_call", "name"]
+          content: "3"
+      -
+        tools: []
+        message:
+          type: "user"
+          content: "add 4 to that"
+expected_output_regex: "add.*4.*3"
diff --git a/tool_calls/tool_calls_with_no_tool.yml b/tool_calls/tool_calls_with_no_tool.yml
@@ -0,0 +1,27 @@
+id: tool_call_no_tools
+name: Tool calls with no tool
+description: Evaluate what happens after a tool call comes back and we resubmit with no tools; does the LLM get confused?
+type: prompt
+args:
+  output_thinking: true
+  temperature: 0
+  prompt: "You are a helpful bot"
+  message: "echo the text sam and then respond to me with the text done"
+  tools:
+    -
+      name: "echo"
+      description: "will echo the text"
+      parameters:
+        - name: "text"
+          type: "string"
+          description: "the text to echo"
+          required: true
+  followup:
+    tools: []
+    message:
+      type: "tool"
+      id: ["tool_call", "id"]
+      name: ["tool_call", "name"]
+      content: "content was echoed"
+expected_output_regex: "one"
+