Skip to content

Commit 45aff83

Browse files
committed
traces
1 parent aed0dcf commit 45aff83

File tree

2 files changed

+136
-132
lines changed

2 files changed

+136
-132
lines changed

examples/Optimize Prompts.ipynb

Lines changed: 136 additions & 132 deletions
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@
6767
},
6868
{
6969
"cell_type": "code",
70-
"execution_count": 31,
70+
"execution_count": null,
7171
"metadata": {},
7272
"outputs": [],
7373
"source": [
@@ -79,7 +79,7 @@
7979
"from enum import Enum\n",
8080
"from typing import Any, List, Dict\n",
8181
"from pydantic import BaseModel, Field\n",
82-
"from agents import Agent, Runner, set_default_openai_client\n",
82+
"from agents import Agent, Runner, set_default_openai_client, trace\n",
8383
"\n",
8484
"openai_client: AsyncOpenAI | None = None\n",
8585
"\n",
@@ -384,99 +384,95 @@
384384
"cell_type": "markdown",
385385
"metadata": {},
386386
"source": [
387-
"## 4. Using Evaluations to Arrive at these Agents\n",
387+
"## 4. Using Evaluations to Arrive at These Agents\n",
388388
"\n",
389389
"Let's see how we used OpenAI Evals to tune agent instructions and pick the correct model to use. In order to do so we constructed a set of golden examples: each one contains original messages (developer message + user/assistant message) and the changes our optimization workflow should make. Here are two example of golden pairs that we used:"
390390
]
391391
},
392392
{
393393
"cell_type": "code",
394394
"execution_count": null,
395-
"metadata": {
396-
"vscode": {
397-
"languageId": "javascript"
398-
}
399-
},
395+
"metadata": {},
400396
"outputs": [],
401397
"source": [
402398
"[\n",
403-
" {\n",
404-
" \"focus\": \"contradiction_issues\",\n",
405-
" \"input_payload\": {\n",
406-
" \"developer_message\": \"Always answer in **English**.\\nNunca respondas en inglés.\",\n",
407-
" \"messages\": [\n",
408-
" {\n",
409-
" \"role\": \"user\",\n",
410-
" \"content\": \"¿Qué hora es?\"\n",
411-
" }\n",
412-
" ]\n",
413-
" },\n",
414-
" \"golden_output\": {\n",
415-
" \"changes\": true,\n",
416-
" \"new_developer_message\": \"Always answer **in English**.\",\n",
417-
" \"new_messages\": [\n",
418-
" {\n",
419-
" \"role\": \"user\",\n",
420-
" \"content\": \"¿Qué hora es?\"\n",
421-
" }\n",
422-
" ],\n",
423-
" \"contradiction_issues\": \"Developer message simultaneously insists on English and forbids it.\",\n",
424-
" \"few_shot_contradiction_issues\": \"\",\n",
425-
" \"format_issues\": \"\",\n",
426-
" \"general_improvements\": \"\"\n",
427-
" }\n",
399+
" {\n",
400+
" \"focus\": \"contradiction_issues\",\n",
401+
" \"input_payload\": {\n",
402+
" \"developer_message\": \"Always answer in **English**.\\nNunca respondas en inglés.\",\n",
403+
" \"messages\": [\n",
404+
" {\n",
405+
" \"role\": \"user\",\n",
406+
" \"content\": \"¿Qué hora es?\"\n",
407+
" }\n",
408+
" ]\n",
428409
" },\n",
429-
" {\n",
430-
" \"focus\": \"few_shot_contradiction_issues\",\n",
431-
" \"input_payload\": {\n",
432-
" \"developer_message\": \"Respond with **only 'yes' or 'no'** – no explanations.\",\n",
433-
" \"messages\": [\n",
434-
" {\n",
435-
" \"role\": \"user\",\n",
436-
" \"content\": \"Is the sky blue?\"\n",
437-
" },\n",
438-
" {\n",
439-
" \"role\": \"assistant\",\n",
440-
" \"content\": \"Yes, because wavelengths …\"\n",
441-
" },\n",
442-
" {\n",
443-
" \"role\": \"user\",\n",
444-
" \"content\": \"Is water wet?\"\n",
445-
" },\n",
446-
" {\n",
447-
" \"role\": \"assistant\",\n",
448-
" \"content\": \"Yes.\"\n",
449-
" }\n",
450-
" ]\n",
451-
" },\n",
452-
" \"golden_output\": {\n",
453-
" \"changes\": true,\n",
454-
" \"new_developer_message\": \"Respond with **only** the single word \\\"yes\\\" or \\\"no\\\".\",\n",
455-
" \"new_messages\": [\n",
456-
" {\n",
457-
" \"role\": \"user\",\n",
458-
" \"content\": \"Is the sky blue?\"\n",
459-
" },\n",
460-
" {\n",
461-
" \"role\": \"assistant\",\n",
462-
" \"content\": \"yes\"\n",
463-
" },\n",
464-
" {\n",
465-
" \"role\": \"user\",\n",
466-
" \"content\": \"Is water wet?\"\n",
467-
" },\n",
468-
" {\n",
469-
" \"role\": \"assistant\",\n",
470-
" \"content\": \"yes\"\n",
471-
" }\n",
472-
" ],\n",
473-
" \"contradiction_issues\": \"\",\n",
474-
" \"few_shot_contradiction_issues\": \"Assistant examples include explanations despite instruction not to.\",\n",
475-
" \"format_issues\": \"\",\n",
476-
" \"general_improvements\": \"\"\n",
477-
" }\n",
410+
" \"golden_output\": {\n",
411+
" \"changes\": True,\n",
412+
" \"new_developer_message\": \"Always answer **in English**.\",\n",
413+
" \"new_messages\": [\n",
414+
" {\n",
415+
" \"role\": \"user\",\n",
416+
" \"content\": \"¿Qué hora es?\"\n",
417+
" }\n",
418+
" ],\n",
419+
" \"contradiction_issues\": \"Developer message simultaneously insists on English and forbids it.\",\n",
420+
" \"few_shot_contradiction_issues\": \"\",\n",
421+
" \"format_issues\": \"\",\n",
422+
" \"general_improvements\": \"\"\n",
478423
" }\n",
479-
" ]"
424+
" },\n",
425+
" {\n",
426+
" \"focus\": \"few_shot_contradiction_issues\",\n",
427+
" \"input_payload\": {\n",
428+
" \"developer_message\": \"Respond with **only 'yes' or 'no'** – no explanations.\",\n",
429+
" \"messages\": [\n",
430+
" {\n",
431+
" \"role\": \"user\",\n",
432+
" \"content\": \"Is the sky blue?\"\n",
433+
" },\n",
434+
" {\n",
435+
" \"role\": \"assistant\",\n",
436+
" \"content\": \"Yes, because wavelengths …\"\n",
437+
" },\n",
438+
" {\n",
439+
" \"role\": \"user\",\n",
440+
" \"content\": \"Is water wet?\"\n",
441+
" },\n",
442+
" {\n",
443+
" \"role\": \"assistant\",\n",
444+
" \"content\": \"Yes.\"\n",
445+
" }\n",
446+
" ]\n",
447+
" },\n",
448+
" \"golden_output\": {\n",
449+
" \"changes\": True,\n",
450+
" \"new_developer_message\": \"Respond with **only** the single word \\\"yes\\\" or \\\"no\\\".\",\n",
451+
" \"new_messages\": [\n",
452+
" {\n",
453+
" \"role\": \"user\",\n",
454+
" \"content\": \"Is the sky blue?\"\n",
455+
" },\n",
456+
" {\n",
457+
" \"role\": \"assistant\",\n",
458+
" \"content\": \"yes\"\n",
459+
" },\n",
460+
" {\n",
461+
" \"role\": \"user\",\n",
462+
" \"content\": \"Is water wet?\"\n",
463+
" },\n",
464+
" {\n",
465+
" \"role\": \"assistant\",\n",
466+
" \"content\": \"yes\"\n",
467+
" }\n",
468+
" ],\n",
469+
" \"contradiction_issues\": \"\",\n",
470+
" \"few_shot_contradiction_issues\": \"Assistant examples include explanations despite instruction not to.\",\n",
471+
" \"format_issues\": \"\",\n",
472+
" \"general_improvements\": \"\"\n",
473+
" }\n",
474+
" }\n",
475+
"]"
480476
]
481477
},
482478
{
@@ -535,55 +531,63 @@
535531
" Returns a unified dict suitable for an API or endpoint.\n",
536532
" \"\"\"\n",
537533
"\n",
538-
" # 1. Run all checkers in parallel (contradiction, format, fewshot if there are examples)\n",
539-
" tasks = [\n",
540-
" Runner.run(dev_contradiction_checker, developer_message),\n",
541-
" Runner.run(format_checker, developer_message),\n",
542-
" ]\n",
543-
" if messages:\n",
544-
" fs_input = {\n",
545-
" \"DEVELOPER_MESSAGE\": developer_message,\n",
546-
" \"USER_EXAMPLES\": [m.content for m in messages if m.role == \"user\"],\n",
547-
" \"ASSISTANT_EXAMPLES\": [m.content for m in messages if m.role == \"assistant\"],\n",
548-
" }\n",
549-
" tasks.append(Runner.run(fewshot_consistency_checker, json.dumps(fs_input)))\n",
550-
"\n",
551-
" results = await asyncio.gather(*tasks)\n",
552-
"\n",
553-
" # Unpack results\n",
554-
" cd_issues: Issues = results[0].final_output\n",
555-
" fi_issues: Issues = results[1].final_output\n",
556-
" fs_issues: FewShotIssues = results[2].final_output if messages else FewShotIssues.no_issues()\n",
557-
"\n",
558-
" # 3. Rewrites as needed\n",
559-
" final_prompt = developer_message\n",
560-
" if cd_issues.has_issues or fi_issues.has_issues:\n",
561-
" pr_input = {\n",
562-
" \"ORIGINAL_DEVELOPER_MESSAGE\": developer_message,\n",
563-
" \"CONTRADICTION_ISSUES\": cd_issues.model_dump(),\n",
564-
" \"FORMAT_ISSUES\": fi_issues.model_dump(),\n",
565-
" }\n",
566-
" pr_res = await Runner.run(dev_rewriter, json.dumps(pr_input))\n",
567-
" final_prompt = pr_res.final_output.new_developer_message\n",
568-
"\n",
569-
" final_messages: Union[List[\"ChatMessage\"], List[Dict[str, str]]] = messages\n",
570-
" if fs_issues.has_issues:\n",
571-
" mr_input = {\n",
572-
" \"NEW_DEVELOPER_MESSAGE\": final_prompt,\n",
573-
" \"ORIGINAL_MESSAGES\": _normalize_messages(messages),\n",
574-
" \"FEW_SHOT_ISSUES\": fs_issues.model_dump(),\n",
575-
" }\n",
576-
" mr_res = await Runner.run(fewshot_rewriter, json.dumps(mr_input))\n",
577-
" final_messages = mr_res.final_output.messages\n",
578-
"\n",
579-
" return {\n",
580-
" \"changes\": True,\n",
581-
" \"new_developer_message\": final_prompt,\n",
582-
" \"new_messages\": _normalize_messages(final_messages),\n",
583-
" \"contradiction_issues\": \"\\n\".join(cd_issues.issues),\n",
584-
" \"few_shot_contradiction_issues\": \"\\n\".join(fs_issues.issues),\n",
585-
" \"format_issues\": \"\\n\".join(fi_issues.issues),\n",
586-
" }"
534+
" with trace(\"optimize_prompt_workflow\"):\n",
535+
" # 1. Run all checkers in parallel (contradiction, format, fewshot if there are examples)\n",
536+
" tasks = [\n",
537+
" Runner.run(dev_contradiction_checker, developer_message),\n",
538+
" Runner.run(format_checker, developer_message),\n",
539+
" ]\n",
540+
" if messages:\n",
541+
" fs_input = {\n",
542+
" \"DEVELOPER_MESSAGE\": developer_message,\n",
543+
" \"USER_EXAMPLES\": [m.content for m in messages if m.role == \"user\"],\n",
544+
" \"ASSISTANT_EXAMPLES\": [m.content for m in messages if m.role == \"assistant\"],\n",
545+
" }\n",
546+
" tasks.append(Runner.run(fewshot_consistency_checker, json.dumps(fs_input)))\n",
547+
"\n",
548+
" results = await asyncio.gather(*tasks)\n",
549+
"\n",
550+
" # Unpack results\n",
551+
" cd_issues: Issues = results[0].final_output\n",
552+
" fi_issues: Issues = results[1].final_output\n",
553+
" fs_issues: FewShotIssues = results[2].final_output if messages else FewShotIssues.no_issues()\n",
554+
"\n",
555+
" # 3. Rewrites as needed\n",
556+
" final_prompt = developer_message\n",
557+
" if cd_issues.has_issues or fi_issues.has_issues:\n",
558+
" pr_input = {\n",
559+
" \"ORIGINAL_DEVELOPER_MESSAGE\": developer_message,\n",
560+
" \"CONTRADICTION_ISSUES\": cd_issues.model_dump(),\n",
561+
" \"FORMAT_ISSUES\": fi_issues.model_dump(),\n",
562+
" }\n",
563+
" pr_res = await Runner.run(dev_rewriter, json.dumps(pr_input))\n",
564+
" final_prompt = pr_res.final_output.new_developer_message\n",
565+
"\n",
566+
" final_messages: list[ChatMessage] | list[dict[str, str]] = messages\n",
567+
" if fs_issues.has_issues:\n",
568+
" mr_input = {\n",
569+
" \"NEW_DEVELOPER_MESSAGE\": final_prompt,\n",
570+
" \"ORIGINAL_MESSAGES\": _normalize_messages(messages),\n",
571+
" \"FEW_SHOT_ISSUES\": fs_issues.model_dump(),\n",
572+
" }\n",
573+
" mr_res = await Runner.run(fewshot_rewriter, json.dumps(mr_input))\n",
574+
" final_messages = mr_res.final_output.messages\n",
575+
"\n",
576+
" return {\n",
577+
" \"changes\": True,\n",
578+
" \"new_developer_message\": final_prompt,\n",
579+
" \"new_messages\": _normalize_messages(final_messages),\n",
580+
" \"contradiction_issues\": \"\\n\".join(cd_issues.issues),\n",
581+
" \"few_shot_contradiction_issues\": \"\\n\".join(fs_issues.issues),\n",
582+
" \"format_issues\": \"\\n\".join(fi_issues.issues),\n",
583+
" }"
584+
]
585+
},
586+
{
587+
"cell_type": "markdown",
588+
"metadata": {},
589+
"source": [
590+
"![Trace for the workflow](../images/optimizepromptfig3.png)"
587591
]
588592
},
589593
{

images/optimizepromptfig3.png

60.4 KB
Loading

0 commit comments

Comments
 (0)