diff --git a/README.md b/README.md
index 147057f..955abff 100644
--- a/README.md
+++ b/README.md
@@ -3,3 +3,14 @@
 This repo is not a plugin, and is meant to be used in conjunction with [Discourse AI](https://github.com/discourse/discourse-ai) plugin.
 
 See https://github.com/discourse/discourse-ai?tab=readme-ov-file#evals for more information.
+
+
+#### Prompts
+
+Each eval config may contain a single test case or multiple test cases. Attributes (prompt/prompts, message/messages, followup/followups) are singular or plural accordingly.
+
+For a single test case example, see
+- https://github.com/discourse/discourse-ai-evals/blob/main/tool_calls/tool_calls_with_no_tool.yml
+For multiple test case examples, see
+- https://github.com/discourse/discourse-ai-evals/blob/main/translate/translate_topic_title.yml (with judge)
+- https://github.com/discourse/discourse-ai-evals/blob/main/tool_calls/tool_call_chains.yml (with multiple followups)
diff --git a/tool_calls/tool_call_chains.yml b/tool_calls/tool_call_chains.yml
new file mode 100644
index 0000000..46941d4
--- /dev/null
+++ b/tool_calls/tool_call_chains.yml
@@ -0,0 +1,37 @@
+id: tool_call_chains
+name: Tool call chains
+description: Call multiple tools in multiple tests
+type: prompt
+args:
+  - id: addition-test
+    name: Addition
+    description: Test that addition works in subsequent tool calls
+    temperature: 0
+    stream: false
+    prompts:
+      - "You are a helpful bot"
+    messages:
+      - "Add 1 and 2"
+    tools:
+      -
+        name: "addition"
+        description: "Will add two numbers"
+        parameters:
+          - name: "text"
+            type: "string"
+            description: "the numbers to add"
+            required: true
+    followups:
+      -
+        tools: []
+        message:
+          type: "tool"
+          id: ["tool_call", "id"]
+          name: ["tool_call", "name"]
+          content: "3"
+      -
+        tools: []
+        message:
+          type: "user"
+          content: "add 4 to that"
+expected_output_regex: "add.*4.*3"
diff --git a/tool_calls/tool_calls_with_no_tool.yml b/tool_calls/tool_calls_with_no_tool.yml
new file mode 100644
index 0000000..a8754ae
--- /dev/null
+++ b/tool_calls/tool_calls_with_no_tool.yml
@@ -0,0 +1,27 @@
+id: tool_call_no_tools
+name: Tool calls with no tool
+description: Eval to see what happens after a tool call comes back and we resubmit with no tools - does the LLM get confused?
+type: prompt
+args:
+  output_thinking: true
+  temperature: 0
+  prompt: "You are a helpful bot"
+  message: "echo the text sam and then respond to me with the text done"
+  tools:
+    -
+      name: "echo"
+      description: "will echo the text"
+      parameters:
+        - name: "text"
+          type: "string"
+          description: "the text to echo"
+          required: true
+  followup:
+    tools: []
+    message:
+      type: "tool"
+      id: ["tool_call", "id"]
+      name: ["tool_call", "name"]
+      content: "content was echoed"
+expected_output_regex: "one"
+
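
For quick reference, the singular vs plural shapes described in the README change above can be condensed as follows. This is an illustrative sketch only: the values are copied from the two files added in this diff and trimmed for brevity (tool definitions and the `["tool_call", ...]` references are omitted), and the authoritative schema is whatever the Discourse AI evals runner accepts.

```yaml
# Single test case: `args` is a mapping and attribute names are singular.
id: tool_call_no_tools
type: prompt
args:
  prompt: "You are a helpful bot"
  message: "echo the text sam and then respond to me with the text done"
  followup:
    tools: []
    message: { type: "tool", content: "content was echoed" }
expected_output_regex: "one"
---
# Multiple test cases: `args` is a list and attribute names are plural.
id: tool_call_chains
type: prompt
args:
  - id: addition-test
    prompts: ["You are a helpful bot"]
    messages: ["Add 1 and 2"]
    followups:
      - tools: []
        message: { type: "tool", content: "3" }
      - tools: []
        message: { type: "user", content: "add 4 to that" }
expected_output_regex: "add.*4.*3"
```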