|
67 | 67 | },
|
68 | 68 | {
|
69 | 69 | "cell_type": "code",
|
70 |
| - "execution_count": 31, |
| 70 | + "execution_count": null, |
71 | 71 | "metadata": {},
|
72 | 72 | "outputs": [],
|
73 | 73 | "source": [
|
|
79 | 79 | "from enum import Enum\n",
|
80 | 80 | "from typing import Any, List, Dict\n",
|
81 | 81 | "from pydantic import BaseModel, Field\n",
|
82 |
| - "from agents import Agent, Runner, set_default_openai_client\n", |
| 82 | + "from agents import Agent, Runner, set_default_openai_client, trace\n", |
83 | 83 | "\n",
|
84 | 84 | "openai_client: AsyncOpenAI | None = None\n",
|
85 | 85 | "\n",
|
|
384 | 384 | "cell_type": "markdown",
|
385 | 385 | "metadata": {},
|
386 | 386 | "source": [
|
387 |
| - "## 4. Using Evaluations to Arrive at these Agents\n", |
| 387 | + "## 4. Using Evaluations to Arrive at These Agents\n", |
388 | 388 | "\n",
|
389 | 389 | "Let's see how we used OpenAI Evals to tune agent instructions and pick the correct model to use. To do so, we constructed a set of golden examples: each one contains the original messages (developer message + user/assistant messages) and the changes our optimization workflow should make. Here are two examples of golden pairs that we used:"
|
390 | 390 | ]
|
391 | 391 | },
|
392 | 392 | {
|
393 | 393 | "cell_type": "code",
|
394 | 394 | "execution_count": null,
|
395 |
| - "metadata": { |
396 |
| - "vscode": { |
397 |
| - "languageId": "javascript" |
398 |
| - } |
399 |
| - }, |
| 395 | + "metadata": {}, |
400 | 396 | "outputs": [],
|
401 | 397 | "source": [
|
402 | 398 | "[\n",
|
403 |
| - " {\n", |
404 |
| - " \"focus\": \"contradiction_issues\",\n", |
405 |
| - " \"input_payload\": {\n", |
406 |
| - " \"developer_message\": \"Always answer in **English**.\\nNunca respondas en inglés.\",\n", |
407 |
| - " \"messages\": [\n", |
408 |
| - " {\n", |
409 |
| - " \"role\": \"user\",\n", |
410 |
| - " \"content\": \"¿Qué hora es?\"\n", |
411 |
| - " }\n", |
412 |
| - " ]\n", |
413 |
| - " },\n", |
414 |
| - " \"golden_output\": {\n", |
415 |
| - " \"changes\": true,\n", |
416 |
| - " \"new_developer_message\": \"Always answer **in English**.\",\n", |
417 |
| - " \"new_messages\": [\n", |
418 |
| - " {\n", |
419 |
| - " \"role\": \"user\",\n", |
420 |
| - " \"content\": \"¿Qué hora es?\"\n", |
421 |
| - " }\n", |
422 |
| - " ],\n", |
423 |
| - " \"contradiction_issues\": \"Developer message simultaneously insists on English and forbids it.\",\n", |
424 |
| - " \"few_shot_contradiction_issues\": \"\",\n", |
425 |
| - " \"format_issues\": \"\",\n", |
426 |
| - " \"general_improvements\": \"\"\n", |
427 |
| - " }\n", |
| 399 | + " {\n", |
| 400 | + " \"focus\": \"contradiction_issues\",\n", |
| 401 | + " \"input_payload\": {\n", |
| 402 | + " \"developer_message\": \"Always answer in **English**.\\nNunca respondas en inglés.\",\n", |
| 403 | + " \"messages\": [\n", |
| 404 | + " {\n", |
| 405 | + " \"role\": \"user\",\n", |
| 406 | + " \"content\": \"¿Qué hora es?\"\n", |
| 407 | + " }\n", |
| 408 | + " ]\n", |
428 | 409 | " },\n",
|
429 |
| - " {\n", |
430 |
| - " \"focus\": \"few_shot_contradiction_issues\",\n", |
431 |
| - " \"input_payload\": {\n", |
432 |
| - " \"developer_message\": \"Respond with **only 'yes' or 'no'** – no explanations.\",\n", |
433 |
| - " \"messages\": [\n", |
434 |
| - " {\n", |
435 |
| - " \"role\": \"user\",\n", |
436 |
| - " \"content\": \"Is the sky blue?\"\n", |
437 |
| - " },\n", |
438 |
| - " {\n", |
439 |
| - " \"role\": \"assistant\",\n", |
440 |
| - " \"content\": \"Yes, because wavelengths …\"\n", |
441 |
| - " },\n", |
442 |
| - " {\n", |
443 |
| - " \"role\": \"user\",\n", |
444 |
| - " \"content\": \"Is water wet?\"\n", |
445 |
| - " },\n", |
446 |
| - " {\n", |
447 |
| - " \"role\": \"assistant\",\n", |
448 |
| - " \"content\": \"Yes.\"\n", |
449 |
| - " }\n", |
450 |
| - " ]\n", |
451 |
| - " },\n", |
452 |
| - " \"golden_output\": {\n", |
453 |
| - " \"changes\": true,\n", |
454 |
| - " \"new_developer_message\": \"Respond with **only** the single word \\\"yes\\\" or \\\"no\\\".\",\n", |
455 |
| - " \"new_messages\": [\n", |
456 |
| - " {\n", |
457 |
| - " \"role\": \"user\",\n", |
458 |
| - " \"content\": \"Is the sky blue?\"\n", |
459 |
| - " },\n", |
460 |
| - " {\n", |
461 |
| - " \"role\": \"assistant\",\n", |
462 |
| - " \"content\": \"yes\"\n", |
463 |
| - " },\n", |
464 |
| - " {\n", |
465 |
| - " \"role\": \"user\",\n", |
466 |
| - " \"content\": \"Is water wet?\"\n", |
467 |
| - " },\n", |
468 |
| - " {\n", |
469 |
| - " \"role\": \"assistant\",\n", |
470 |
| - " \"content\": \"yes\"\n", |
471 |
| - " }\n", |
472 |
| - " ],\n", |
473 |
| - " \"contradiction_issues\": \"\",\n", |
474 |
| - " \"few_shot_contradiction_issues\": \"Assistant examples include explanations despite instruction not to.\",\n", |
475 |
| - " \"format_issues\": \"\",\n", |
476 |
| - " \"general_improvements\": \"\"\n", |
477 |
| - " }\n", |
| 410 | + " \"golden_output\": {\n", |
| 411 | + " \"changes\": True,\n", |
| 412 | + " \"new_developer_message\": \"Always answer **in English**.\",\n", |
| 413 | + " \"new_messages\": [\n", |
| 414 | + " {\n", |
| 415 | + " \"role\": \"user\",\n", |
| 416 | + " \"content\": \"¿Qué hora es?\"\n", |
| 417 | + " }\n", |
| 418 | + " ],\n", |
| 419 | + " \"contradiction_issues\": \"Developer message simultaneously insists on English and forbids it.\",\n", |
| 420 | + " \"few_shot_contradiction_issues\": \"\",\n", |
| 421 | + " \"format_issues\": \"\",\n", |
| 422 | + " \"general_improvements\": \"\"\n", |
478 | 423 | " }\n",
|
479 |
| - " ]" |
| 424 | + " },\n", |
| 425 | + " {\n", |
| 426 | + " \"focus\": \"few_shot_contradiction_issues\",\n", |
| 427 | + " \"input_payload\": {\n", |
| 428 | + " \"developer_message\": \"Respond with **only 'yes' or 'no'** – no explanations.\",\n", |
| 429 | + " \"messages\": [\n", |
| 430 | + " {\n", |
| 431 | + " \"role\": \"user\",\n", |
| 432 | + " \"content\": \"Is the sky blue?\"\n", |
| 433 | + " },\n", |
| 434 | + " {\n", |
| 435 | + " \"role\": \"assistant\",\n", |
| 436 | + " \"content\": \"Yes, because wavelengths …\"\n", |
| 437 | + " },\n", |
| 438 | + " {\n", |
| 439 | + " \"role\": \"user\",\n", |
| 440 | + " \"content\": \"Is water wet?\"\n", |
| 441 | + " },\n", |
| 442 | + " {\n", |
| 443 | + " \"role\": \"assistant\",\n", |
| 444 | + " \"content\": \"Yes.\"\n", |
| 445 | + " }\n", |
| 446 | + " ]\n", |
| 447 | + " },\n", |
| 448 | + " \"golden_output\": {\n", |
| 449 | + " \"changes\": True,\n", |
| 450 | + " \"new_developer_message\": \"Respond with **only** the single word \\\"yes\\\" or \\\"no\\\".\",\n", |
| 451 | + " \"new_messages\": [\n", |
| 452 | + " {\n", |
| 453 | + " \"role\": \"user\",\n", |
| 454 | + " \"content\": \"Is the sky blue?\"\n", |
| 455 | + " },\n", |
| 456 | + " {\n", |
| 457 | + " \"role\": \"assistant\",\n", |
| 458 | + " \"content\": \"yes\"\n", |
| 459 | + " },\n", |
| 460 | + " {\n", |
| 461 | + " \"role\": \"user\",\n", |
| 462 | + " \"content\": \"Is water wet?\"\n", |
| 463 | + " },\n", |
| 464 | + " {\n", |
| 465 | + " \"role\": \"assistant\",\n", |
| 466 | + " \"content\": \"yes\"\n", |
| 467 | + " }\n", |
| 468 | + " ],\n", |
| 469 | + " \"contradiction_issues\": \"\",\n", |
| 470 | + " \"few_shot_contradiction_issues\": \"Assistant examples include explanations despite instruction not to.\",\n", |
| 471 | + " \"format_issues\": \"\",\n", |
| 472 | + " \"general_improvements\": \"\"\n", |
| 473 | + " }\n", |
| 474 | + " }\n", |
| 475 | + "]" |
480 | 476 | ]
|
481 | 477 | },
|
482 | 478 | {
|
|
535 | 531 | " Returns a unified dict suitable for an API or endpoint.\n",
|
536 | 532 | " \"\"\"\n",
|
537 | 533 | "\n",
|
538 |
| - " # 1. Run all checkers in parallel (contradiction, format, fewshot if there are examples)\n", |
539 |
| - " tasks = [\n", |
540 |
| - " Runner.run(dev_contradiction_checker, developer_message),\n", |
541 |
| - " Runner.run(format_checker, developer_message),\n", |
542 |
| - " ]\n", |
543 |
| - " if messages:\n", |
544 |
| - " fs_input = {\n", |
545 |
| - " \"DEVELOPER_MESSAGE\": developer_message,\n", |
546 |
| - " \"USER_EXAMPLES\": [m.content for m in messages if m.role == \"user\"],\n", |
547 |
| - " \"ASSISTANT_EXAMPLES\": [m.content for m in messages if m.role == \"assistant\"],\n", |
548 |
| - " }\n", |
549 |
| - " tasks.append(Runner.run(fewshot_consistency_checker, json.dumps(fs_input)))\n", |
550 |
| - "\n", |
551 |
| - " results = await asyncio.gather(*tasks)\n", |
552 |
| - "\n", |
553 |
| - " # Unpack results\n", |
554 |
| - " cd_issues: Issues = results[0].final_output\n", |
555 |
| - " fi_issues: Issues = results[1].final_output\n", |
556 |
| - " fs_issues: FewShotIssues = results[2].final_output if messages else FewShotIssues.no_issues()\n", |
557 |
| - "\n", |
558 |
| - " # 3. Rewrites as needed\n", |
559 |
| - " final_prompt = developer_message\n", |
560 |
| - " if cd_issues.has_issues or fi_issues.has_issues:\n", |
561 |
| - " pr_input = {\n", |
562 |
| - " \"ORIGINAL_DEVELOPER_MESSAGE\": developer_message,\n", |
563 |
| - " \"CONTRADICTION_ISSUES\": cd_issues.model_dump(),\n", |
564 |
| - " \"FORMAT_ISSUES\": fi_issues.model_dump(),\n", |
565 |
| - " }\n", |
566 |
| - " pr_res = await Runner.run(dev_rewriter, json.dumps(pr_input))\n", |
567 |
| - " final_prompt = pr_res.final_output.new_developer_message\n", |
568 |
| - "\n", |
569 |
| - " final_messages: Union[List[\"ChatMessage\"], List[Dict[str, str]]] = messages\n", |
570 |
| - " if fs_issues.has_issues:\n", |
571 |
| - " mr_input = {\n", |
572 |
| - " \"NEW_DEVELOPER_MESSAGE\": final_prompt,\n", |
573 |
| - " \"ORIGINAL_MESSAGES\": _normalize_messages(messages),\n", |
574 |
| - " \"FEW_SHOT_ISSUES\": fs_issues.model_dump(),\n", |
575 |
| - " }\n", |
576 |
| - " mr_res = await Runner.run(fewshot_rewriter, json.dumps(mr_input))\n", |
577 |
| - " final_messages = mr_res.final_output.messages\n", |
578 |
| - "\n", |
579 |
| - " return {\n", |
580 |
| - " \"changes\": True,\n", |
581 |
| - " \"new_developer_message\": final_prompt,\n", |
582 |
| - " \"new_messages\": _normalize_messages(final_messages),\n", |
583 |
| - " \"contradiction_issues\": \"\\n\".join(cd_issues.issues),\n", |
584 |
| - " \"few_shot_contradiction_issues\": \"\\n\".join(fs_issues.issues),\n", |
585 |
| - " \"format_issues\": \"\\n\".join(fi_issues.issues),\n", |
586 |
| - " }" |
| 534 | + " with trace(\"optimize_prompt_workflow\"):\n", |
| 535 | + " # 1. Run all checkers in parallel (contradiction, format, fewshot if there are examples)\n", |
| 536 | + " tasks = [\n", |
| 537 | + " Runner.run(dev_contradiction_checker, developer_message),\n", |
| 538 | + " Runner.run(format_checker, developer_message),\n", |
| 539 | + " ]\n", |
| 540 | + " if messages:\n", |
| 541 | + " fs_input = {\n", |
| 542 | + " \"DEVELOPER_MESSAGE\": developer_message,\n", |
| 543 | + " \"USER_EXAMPLES\": [m.content for m in messages if m.role == \"user\"],\n", |
| 544 | + " \"ASSISTANT_EXAMPLES\": [m.content for m in messages if m.role == \"assistant\"],\n", |
| 545 | + " }\n", |
| 546 | + " tasks.append(Runner.run(fewshot_consistency_checker, json.dumps(fs_input)))\n", |
| 547 | + "\n", |
| 548 | + " results = await asyncio.gather(*tasks)\n", |
| 549 | + "\n", |
| 550 | + " # Unpack results\n", |
| 551 | + " cd_issues: Issues = results[0].final_output\n", |
| 552 | + " fi_issues: Issues = results[1].final_output\n", |
| 553 | + " fs_issues: FewShotIssues = results[2].final_output if messages else FewShotIssues.no_issues()\n", |
| 554 | + "\n", |
| 555 | + " # 3. Rewrites as needed\n", |
| 556 | + " final_prompt = developer_message\n", |
| 557 | + " if cd_issues.has_issues or fi_issues.has_issues:\n", |
| 558 | + " pr_input = {\n", |
| 559 | + " \"ORIGINAL_DEVELOPER_MESSAGE\": developer_message,\n", |
| 560 | + " \"CONTRADICTION_ISSUES\": cd_issues.model_dump(),\n", |
| 561 | + " \"FORMAT_ISSUES\": fi_issues.model_dump(),\n", |
| 562 | + " }\n", |
| 563 | + " pr_res = await Runner.run(dev_rewriter, json.dumps(pr_input))\n", |
| 564 | + " final_prompt = pr_res.final_output.new_developer_message\n", |
| 565 | + "\n", |
| 566 | + " final_messages: list[ChatMessage] | list[dict[str, str]] = messages\n", |
| 567 | + " if fs_issues.has_issues:\n", |
| 568 | + " mr_input = {\n", |
| 569 | + " \"NEW_DEVELOPER_MESSAGE\": final_prompt,\n", |
| 570 | + " \"ORIGINAL_MESSAGES\": _normalize_messages(messages),\n", |
| 571 | + " \"FEW_SHOT_ISSUES\": fs_issues.model_dump(),\n", |
| 572 | + " }\n", |
| 573 | + " mr_res = await Runner.run(fewshot_rewriter, json.dumps(mr_input))\n", |
| 574 | + " final_messages = mr_res.final_output.messages\n", |
| 575 | + "\n", |
| 576 | + " return {\n", |
| 577 | + " \"changes\": True,\n", |
| 578 | + " \"new_developer_message\": final_prompt,\n", |
| 579 | + " \"new_messages\": _normalize_messages(final_messages),\n", |
| 580 | + " \"contradiction_issues\": \"\\n\".join(cd_issues.issues),\n", |
| 581 | + " \"few_shot_contradiction_issues\": \"\\n\".join(fs_issues.issues),\n", |
| 582 | + " \"format_issues\": \"\\n\".join(fi_issues.issues),\n", |
| 583 | + " }" |
| 584 | + ] |
| 585 | + }, |
| 586 | + { |
| 587 | + "cell_type": "markdown", |
| 588 | + "metadata": {}, |
| 589 | + "source": [ |
| 590 | + "" |
587 | 591 | ]
|
588 | 592 | },
|
589 | 593 | {
|
|
0 commit comments