|
301 | 301 | " \"name\": \"Reference answer audio model grader\",\n",
|
302 | 302 | " \"model\": \"gpt-4o-audio-preview\",\n",
|
303 | 303 | " \"input\": [\n",
|
304 |
| - " {\n", |
305 |
| - " \"role\": \"user\",\n", |
306 |
| - " \"content\": [\n", |
307 |
| - " {\n", |
308 |
| - " \"type\": \"input_text\",\n", |
309 |
| - " \"text\": \"Evaluate this audio clip to see if it reaches the same conclusion as the reference answer. Score the answer a 1 if it does, 0 if it does not. Reference answer: {{item.official_answer}}\",\n", |
310 |
| - " },\n", |
311 |
| - " {\n", |
312 |
| - " \"type\": \"input_audio\",\n", |
313 |
| - " \"input_audio\": {\n", |
314 |
| - " \"data\": \"{{ sample.output_audio.data }}\",\n", |
315 |
| - " \"format\": \"wav\",\n", |
316 |
| - " },\n", |
317 |
| - " },\n", |
318 |
| - " ],\n", |
319 |
| - " },\n", |
320 |
| - " ],\n", |
| 304 | + " {\n", |
| 305 | + " \"role\": \"system\",\n", |
| 306 | + " \"content\": 'You are a helpful assistant that evaluates audio clips to judge whether they match a provided reference answer. The audio clip is the model\\'s response to the question. Respond ONLY with a single JSON object matching: {\"steps\":[{\"description\":\"string\",\"conclusion\":\"string\"}],\"result\":number}. Do not include any extra text. result must be a float in [0.0, 1.0].'\n", |
| 307 | + " },\n", |
| 308 | + " {\n", |
| 309 | + " \"role\": \"user\",\n", |
| 310 | + " \"content\": [\n", |
| 311 | + " {\n", |
| 312 | + " \"type\": \"input_text\",\n", |
| 313 | + " \"text\": \"Evaluate this audio clip to see if it reaches the same conclusion as the reference answer. Reference answer: {{item.official_answer}}\",\n", |
| 314 | + " },\n", |
| 315 | + " {\n", |
| 316 | + " \"type\": \"input_audio\",\n", |
| 317 | + " \"input_audio\": {\n", |
| 318 | + " \"data\": \"{{ sample.output_audio.data }}\",\n", |
| 319 | + " \"format\": \"wav\",\n", |
| 320 | + " },\n", |
| 321 | + " },\n", |
| 322 | + " ],\n", |
| 323 | + " },\n", |
| 324 | + " ],\n", |
321 | 325 | " \"range\": [0, 1],\n",
|
322 |
| - " \"pass_threshold\": 0.9,\n", |
| 326 | + " \"pass_threshold\": 0.6,\n", |
323 | 327 | "}"
|
324 | 328 | ]
|
325 | 329 | },
|
|
385 | 389 | "sampling_messages = [\n",
|
386 | 390 | " {\n",
|
387 | 391 | " \"role\": \"system\",\n",
|
388 |
| - " \"content\": \"You are a helpful and obedient assistant that can answer questions with audio input. You will be given an audio input containing a question and instructions on exactly how to answer. For example, if the user asks for a single word response, then you should only reply with a single word answer.\"\n", |
| 392 | + " \"content\": \"You are a helpful and obedient assistant that can answer questions with audio input. You will be given an audio input containing a question to answer.\"\n", |
389 | 393 | " },\n",
|
390 | 394 | " {\n",
|
391 | 395 | " \"role\": \"user\",\n",
|
392 | 396 | " \"type\": \"message\",\n",
|
393 | 397 | " \"content\": {\n",
|
394 | 398 | " \"type\": \"input_text\",\n",
|
395 |
| - " \"text\": \"Answer the following question by replying with a single word answer: 'valid' or 'invalid'.\"\n", |
| 399 | + " \"text\": \"Answer the following question by replying with brief reasoning statements and a conclusion with a single word answer: 'valid' or 'invalid'.\"\n", |
396 | 400 | " }\n",
|
397 | 401 | " },\n",
|
398 | 402 | " {\n",
|
|
0 commit comments