|
548 | 548 | },
|
549 | 549 | "metadata": {},
|
550 | 550 | "output_type": "display_data"
|
| 551 | + }, |
| 552 | + { |
| 553 | + "name": "stdout", |
| 554 | + "output_type": "stream", |
| 555 | + "text": [ |
| 556 | + "### Prompt Optimization Results - Coding Tasks\n", |
| 557 | + "\n", |
| 558 | + "| Metric | Baseline | Optimized | Δ (Opt − Base) |\n", |
| 559 | + "|----------------------------|---------:|----------:|---------------:|\n", |
| 560 | + "| Avg Time (s) | 7.906 | 6.977 | -0.929 |\n", |
| 561 | + "| Peak Memory (KB) | 3626.3 | 577.5 | -3048.8 |\n", |
| 562 | + "| Exact (%) | 100.0 | 100.0 | 0.0 |\n", |
| 563 | + "| Sorted (%) | 100.0 | 100.0 | 0.0 |\n", |
| 564 | + "| LLM Adherence (1–5) | 4.40 | 4.90 | +0.50 |\n", |
| 565 | + "| Code Quality (1–5) | 4.73 | 4.90 | +0.16 |\n" |
| 566 | + ] |
551 | 567 | }
|
552 | 568 | ],
|
553 | 569 | "source": [
|
|
573 | 589 | " judge_optimized=Path(\"results_llm_as_judge_optimized\")/\"judgement_summary.csv\",\n",
|
574 | 590 | ")\n",
|
575 | 591 | "\n",
|
576 |
| - "display(Markdown(md))" |
| 592 | + "display(Markdown(md))\n", |
| 593 | + "\n", |
| 594 | + "print(md)" |
577 | 595 | ]
|
578 | 596 | },
|
579 | 597 | {
|
|
619 | 637 | "We will run FailSafeQA evaluations via the helper script and compare Baseline vs Optimized prompts side by side."
|
620 | 638 | ]
|
621 | 639 | },
|
622 |
| - { |
623 |
| - "cell_type": "code", |
624 |
| - "execution_count": null, |
625 |
| - "id": "c5849f77", |
626 |
| - "metadata": {}, |
627 |
| - "outputs": [], |
628 |
| - "source": [] |
629 |
| - }, |
630 | 640 | {
|
631 | 641 | "cell_type": "code",
|
632 | 642 | "execution_count": 3,
|
|
834 | 844 | },
|
835 | 845 | {
|
836 | 846 | "cell_type": "code",
|
837 |
| - "execution_count": 1, |
| 847 | + "execution_count": 11, |
838 | 848 | "id": "c20097e6",
|
839 | 849 | "metadata": {},
|
840 | 850 | "outputs": [
|
|
845 | 855 | "\n",
|
846 | 856 | "**Compliance threshold:** ≥ 6\n",
|
847 | 857 | "\n",
|
848 |
| - "| Metric | Baseline | Optimized | Δ (Opt − Base) |\n", |
849 |
| - "|---|---:|---:|---:|\n", |
850 |
| - "| Robustness (avg across datapoints) | 0.320 | 0.540 | +0.220 |\n", |
851 |
| - "| Context Grounding (avg across datapoints) | 0.800 | 0.950 | +0.150 |\n", |
| 858 | + "| Metric | Baseline | Optimized | Δ (Opt − Base) |\n", |
| 859 | + "| ----------------------------------------- | -------- | --------- | -------------- |\n", |
| 860 | + "| Robustness (avg across datapoints) | 0.320 | 0.540 | +0.220 |\n", |
| 861 | + "| Context Grounding (avg across datapoints) | 0.800 | 0.950 | +0.150 |\n", |
852 | 862 | "\n",
|
853 | 863 | "_Source files:_ `results_failsafeqa.csv` · `results_failsafeqa.csv`"
|
854 | 864 | ],
|
|
858 | 868 | },
|
859 | 869 | "metadata": {},
|
860 | 870 | "output_type": "display_data"
|
| 871 | + }, |
| 872 | + { |
| 873 | + "name": "stdout", |
| 874 | + "output_type": "stream", |
| 875 | + "text": [ |
| 876 | + "## FailSafeQA — Summary\n", |
| 877 | + "\n", |
| 878 | + "**Compliance threshold:** ≥ 6\n", |
| 879 | + "\n", |
| 880 | + "| Metric | Baseline | Optimized | Δ (Opt − Base) |\n", |
| 881 | + "| ----------------------------------------- | -------- | --------- | -------------- |\n", |
| 882 | + "| Robustness (avg across datapoints) | 0.320 | 0.540 | +0.220 |\n", |
| 883 | + "| Context Grounding (avg across datapoints) | 0.800 | 0.950 | +0.150 |\n", |
| 884 | + "\n", |
| 885 | + "_Source files:_ `results_failsafeqa.csv` · `results_failsafeqa.csv`\n" |
| 886 | + ] |
861 | 887 | }
|
862 | 888 | ],
|
863 | 889 | "source": [
|
|
872 | 898 | ") -> str:\n",
|
873 | 899 | " d_r = robust_opt - robust_base\n",
|
874 | 900 | " d_g = ground_opt - ground_base\n",
|
| 901 | + "\n", |
| 902 | + " # Data rows\n", |
| 903 | + " rows = [\n", |
| 904 | + " [\"Metric\", \"Baseline\", \"Optimized\", \"Δ (Opt − Base)\"],\n", |
| 905 | + " [\"Robustness (avg across datapoints)\", f\"{robust_base:.3f}\", f\"{robust_opt:.3f}\", f\"{d_r:+.3f}\"],\n", |
| 906 | + " [\"Context Grounding (avg across datapoints)\", f\"{ground_base:.3f}\", f\"{ground_opt:.3f}\", f\"{d_g:+.3f}\"],\n", |
| 907 | + " ]\n", |
| 908 | + "\n", |
| 909 | + " # Calculate column widths for alignment\n", |
| 910 | + " col_widths = [max(len(str(row[i])) for row in rows) for i in range(len(rows[0]))]\n", |
| 911 | + "\n", |
| 912 | + " # Build table lines with padding\n", |
| 913 | + " lines = []\n", |
| 914 | + " for i, row in enumerate(rows):\n", |
| 915 | + " padded = [str(cell).ljust(col_widths[j]) for j, cell in enumerate(row)]\n", |
| 916 | + " lines.append(\"| \" + \" | \".join(padded) + \" |\")\n", |
| 917 | + " if i == 0: # after header\n", |
| 918 | + " sep = [\"-\" * col_widths[j] for j in range(len(row))]\n", |
| 919 | + " lines.append(\"| \" + \" | \".join(sep) + \" |\")\n", |
| 920 | + "\n", |
| 921 | + " table = \"\\n\".join(lines)\n", |
| 922 | + "\n", |
875 | 923 | " return f\"\"\"\n",
|
876 | 924 | "## FailSafeQA — Summary\n",
|
877 | 925 | "\n",
|
878 | 926 | "**Compliance threshold:** ≥ {threshold}\n",
|
879 | 927 | "\n",
|
880 |
| - "| Metric | Baseline | Optimized | Δ (Opt − Base) |\n", |
881 |
| - "|---|---:|---:|---:|\n", |
882 |
| - "| Robustness (avg across datapoints) | {robust_base:.3f} | {robust_opt:.3f} | {d_r:+.3f} |\n", |
883 |
| - "| Context Grounding (avg across datapoints) | {ground_base:.3f} | {ground_opt:.3f} | {d_g:+.3f} |\n", |
| 928 | + "{table}\n", |
884 | 929 | "\n",
|
885 | 930 | "_Source files:_ `{src_base}` · `{src_opt}`\n",
|
886 | 931 | "\"\"\".strip()\n",
|
887 | 932 | "\n",
|
888 |
| - "# Fill in with your reported numbers\n", |
| 933 | + "# Usage\n", |
889 | 934 | "md = build_markdown_summary_from_metrics(\n",
|
890 | 935 | " robust_base=0.320, ground_base=0.800,\n",
|
891 | 936 | " robust_opt=0.540, ground_opt=0.950,\n",
|
|
894 | 939 | " src_opt=\"results_failsafeqa.csv\",\n",
|
895 | 940 | ")\n",
|
896 | 941 | "\n",
|
897 |
| - "display(Markdown(md))" |
| 942 | + "# Notebook pretty\n", |
| 943 | + "display(Markdown(md))\n", |
| 944 | + "\n", |
| 945 | + "print(md)" |
898 | 946 | ]
|
899 | 947 | },
|
900 | 948 | {
|
|
0 commit comments