prompt adjustments

hendrytl · hendrytl · commit 24f53d44fd10 · 2025-08-26T06:28:35.000-07:00
diff --git a/examples/evaluation/use-cases/EvalsAPI_Audio_Inputs.ipynb b/examples/evaluation/use-cases/EvalsAPI_Audio_Inputs.ipynb
@@ -301,25 +301,29 @@
     "  \"name\": \"Reference answer audio model grader\",\n",
     "  \"model\": \"gpt-4o-audio-preview\",\n",
     "  \"input\": [\n",
-    "              {\n",
-    "                  \"role\": \"user\",\n",
-    "                  \"content\": [\n",
-    "                      {\n",
-    "                          \"type\": \"input_text\",\n",
-    "                          \"text\": \"Evaluate this audio clip to see if it reaches the same conclusion as the reference answer. Score the answer a 1 if it does, 0 if it does not. Reference answer: {{item.official_answer}}\",\n",
-    "                      },\n",
-    "                      {\n",
-    "                          \"type\": \"input_audio\",\n",
-    "                          \"input_audio\": {\n",
-    "                              \"data\": \"{{ sample.output_audio.data }}\",\n",
-    "                              \"format\": \"wav\",\n",
-    "                          },\n",
-    "                      },\n",
-    "                  ],\n",
-    "              },\n",
-    "          ],\n",
+    "        {\n",
+    "            \"role\": \"system\",\n",
+    "            \"content\": 'You are a helpful assistant that evaluates audio clips to judge whether they match a provided reference answer. The audio clip is the model\\'s response to the question. Respond ONLY with a single JSON object matching: {\"steps\":[{\"description\":\"string\",\"conclusion\":\"string\"}],\"result\":number}. Do not include any extra text. result must be a float in [0.0, 1.0].'\n",
+    "        },\n",
+    "        {\n",
+    "            \"role\": \"user\",\n",
+    "            \"content\": [\n",
+    "                {\n",
+    "                    \"type\": \"input_text\",\n",
+    "                    \"text\": \"Evaluate this audio clip to see if it reaches the same conclusion as the reference answer. Reference answer: {{item.official_answer}}\",\n",
+    "                },\n",
+    "                {\n",
+    "                    \"type\": \"input_audio\",\n",
+    "                    \"input_audio\": {\n",
+    "                        \"data\": \"{{ sample.output_audio.data }}\",\n",
+    "                        \"format\": \"wav\",\n",
+    "                    },\n",
+    "                },\n",
+    "            ],\n",
+    "        },\n",
+    "    ],\n",
     "          \"range\": [0, 1],\n",
-    "          \"pass_threshold\": 0.9,\n",
+    "          \"pass_threshold\": 0.6,\n",
     "}"
    ]
   },
@@ -385,14 +389,14 @@
     "sampling_messages = [\n",
     "    {\n",
     "        \"role\": \"system\",\n",
-    "        \"content\": \"You are a helpful and obedient assistant that can answer questions with audio input. You will be given an audio input containing a question and instructions on exactly how to answer. For example, if the user asks for a single word response, then you should only reply with a single word answer.\"\n",
+    "        \"content\": \"You are a helpful and obedient assistant that can answer questions with audio input. You will be given an audio input containing a question to answer.\"\n",
     "    },\n",
     "    {\n",
     "        \"role\": \"user\",\n",
     "        \"type\": \"message\",\n",
     "        \"content\": {\n",
     "            \"type\": \"input_text\",\n",
-    "            \"text\": \"Answer the following question by replying with a single word answer: 'valid' or 'invalid'.\"\n",
+    "            \"text\": \"Answer the following question by replying with brief reasoning statements and a conclusion with a single word answer: 'valid' or 'invalid'.\"\n",
     "        }\n",
     "    },\n",
     "    {\n",