|
15 | 15 | "\n",
|
16 | 16 | "### HealthBench\n",
|
17 | 17 | "\n",
|
18 |
| - "This cookbook evaluates and improves model performance on a focused subset of [HealthBench](https://openai.com/index/healthbench/), a benchmark suite for medical QA. This guide walks through how to configure the datasets, define evaluation rubrics, and fine-tune model behavior using reinforcement signals derived from custom graders.\n", |
| 18 | + "This cookbook evaluates and improves model performance on a focused subset of [HealthBench](https://openai.com/index/healthbench/), a benchmark suite for medical QA. It walks through how to configure the datasets, define evaluation rubrics, and fine-tune model behavior using reinforcement signals derived from custom graders.\n", |
19 | 19 | "\n",
|
20 | 20 | "HealthBench is a comprehensive evaluation benchmark developed to assess the performance of large language models on healthcare-related question answering. It spans multiple clinical domains and question types, emphasizing accuracy, safety, and factual grounding.\n",
|
21 | 21 | "\n",
|
|
155 | 155 | },
|
156 | 156 | {
|
157 | 157 | "cell_type": "code",
|
158 |
| - "execution_count": null, |
| 158 | + "execution_count": 24, |
159 | 159 | "id": "7bdab335",
|
160 | 160 | "metadata": {},
|
161 |
| - "outputs": [], |
| 161 | + "outputs": [ |
| 162 | + { |
| 163 | + "name": "stdout", |
| 164 | + "output_type": "stream", |
| 165 | + "text": [ |
| 166 | + "Counter(data['criteria_met']): Counter({False: 44, True: 9})\n", |
| 167 | + "Counter(filtered_data['criteria_met']): Counter({False: 17, True: 6})\n" |
| 168 | + ] |
| 169 | + } |
| 170 | + ], |
162 | 171 | "source": [
|
163 | 172 | "# let's read in our results file from json\n",
|
164 | 173 | "with open(INPUT_PATH) as f:\n",
|
|
203 | 212 | },
|
204 | 213 | {
|
205 | 214 | "cell_type": "code",
|
206 |
| - "execution_count": null, |
| 215 | + "execution_count": 26, |
207 | 216 | "id": "ed909ae9",
|
208 | 217 | "metadata": {},
|
209 |
| - "outputs": [], |
| 218 | + "outputs": [ |
| 219 | + { |
| 220 | + "name": "stderr", |
| 221 | + "output_type": "stream", |
| 222 | + "text": [ |
| 223 | + "100%|██████████| 23/23 [02:28<00:00, 6.48s/it]\n" |
| 224 | + ] |
| 225 | + }, |
| 226 | + { |
| 227 | + "data": { |
| 228 | + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYYAAAEuCAYAAACKz7VmAAAAOnRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjEwLjYsIGh0dHBzOi8vbWF0cGxvdGxpYi5vcmcvq6yFwwAAAAlwSFlzAAAPYQAAD2EBqD+naQAAJRhJREFUeJzt3QmcTfX/+PH32BkzlmEs2caSfU18Q98sfVOWSItKWUtFIWVLg9Ey+JZ8SSrfUAmVEPpGskbEhLKUJRMKjbLMwljP//H+9LvznzMzmDvOnbnL6/l4nMfMPfeecz73Gud9P+s7yLIsSwAA+D+5XL8AAKAIDAAAGwIDAMCGwAAAsCEwAABsCAwAABsCAwDAhsAAALAhMAAAbAgMAAAbAgMAZIN169ZJx44dpWzZshIUFCSLFi1Kee7ChQsybNgwqVu3rgQHB5vXdO/eXY4cOZIjZSUwAEA2SEpKkvr168vUqVPTPXfmzBnZunWrREZGmp8LFiyQPXv2yN13350jZQ1iET0AyF5BQUGycOFC6dy58xVfs2XLFmnSpIkcPHhQKlSokK3lo8YAAF7o9OnTJoAULVo0269NYAAAL5OcnGz6HB566CEJDQ3N9usTGADAi1y4cEEeeOAB0Vb+adOm5UgZ8uTIVQEAVwwK2q+watWqHKktKAIDAHhRUNi3b5+sXr1awsLCcqwsBAYAyAaJiYmyf//+lMexsbGyfft2KV68uJQpU0buu+8+M1R16dKlcunSJTl27Jh5nT6fL1++bC0rw1UBIBusWbNGWrVqlW5/jx49ZMyYMRIREZHhcVp7aNmypWQnvw8Mly9fNrMHQ0JCzNAvAAhElmVJQkKCmVWdK1euwG5K0qBQvnz5nC4GAHiFw4cPS7ly5QI7MGhNwfVh5FQPPwDktPj4ePMl2XVPDOjA4Go+0qBAYAAQ6IIy0aTOBDcAgA2BAQBgQ2AAAHhPYPClxBUAEChyNDD4UuIKAAgUXjPBzVOJK3SIVpEiRcza5oxKAhCo4t24F/pUH0NOJq4AgECRx98SV5w7d85sqaOka2kM3QAgEF124/6Xx98SV0RHR0tUVFS6/cePHzfBBQCyy/xf/v5i6qT7qmStSVzXSfKbwOBu4ooRI0bI4MGD000DL1myJH0MALJV4pHcjp8zPDxreRoKFCjgH4EhK4kr8ufPb7a0dDXBa60oCACO8sCKzlm9j7lzXI4GBl9KXAEAgSJHA0NMTIwtcYWrCciVuGLx4sXmcYMGDXI8cQUABIocDQx6c7/aNAovmWIBAAGFRncAgA2BAQBgQ2AAANgQGAAANgQGAIANgQEAYENgAADYEBgAADYEBgCADYEBAGBDYAAA2BAYAAA2BAYAgA2BAQBgQ2AAANgQGAAANgQGAIANgQEAYENgAADYEBgAADYEBgCADYEBAGBDYAAA2BAYAAA2BAYAwPUFhmXLlsn69etTHk+dOlUaNGggDz/8sJw8edKtc61bt046duwoZcuWlaCgIFm0aJHtecuyZNSoUVKmTBkpWLCg3H777bJv3z53iwwA8GRgGDJkiMTHx5vfd+zYIc8995y0a9dOYmNjZfDgwW6dKykpSerXr2+CS0YmTJggkydPlrffflu+++47CQ4OlrZt20pycrJ4i4SEBBk0aJBUrFjRBK9mzZrJli1bcrpYAJBledw9QANArVq1zO+fffaZdOjQQV599VXZunWrCRDuuOuuu8yWEa0tTJo0SV588UXp1KmT2ffBBx9IqVKlTM3iwQcfFG/w2GOPyc6dO+XDDz80NZ/Zs2ebms3u3bvlhhtuyOniAYDnawz58uWTM2fOmN+//vprueOOO8zvxYsXT6lJOEED0LFjx8xN1qVIkSLStGlT2bhxo3iDs2fPmuCoNZt//vOfUrVqVRkzZoz5OW3atJwuHgBkT42hRYsWpsmoefPmsnnzZvn444/N/r1790q5cuXEKRoUlNYQUtPHrucycu7cObO5uILV5cuXzeak8+fPy6VLl0ywTH1ubVLSfhinrwfAx1iW46fM6n3FnePcDgxvvvmm9OvXT+bPn2++FbuaS7788ku58847JadFR0dLVFRUuv3Hjx/3SN9E48aNZfTo0VKyZEmzLVy40NRoIiIiJC4uzvHrAfAdhc8614riEhd3Kcv9oR4LDBUqVJClS5em2//GG2+Ik0qXLm1+/vHHH2ZUkos+1lFQVzJixAhbJ7jWGMqXL29u2qGhoeK0OXPmmH6Ghg0bSu7cuaVRo0am/0P7XMLDwx2/HgDfkXgkt+PnDA8Py9JxBQoU8FxgUL/88ovMnDnT/PzPf/5jboBaY9CgUbt2bXGCfuPW4LBy5cqUQKA3eR2d9NRTT13xuPz585strVy5cpnNadWqVZO1a9eaEVZaPg1iXbt2lcqVK3vkegB8SFCQ46fM6n3FnePcvoLeBOvWrWtu0AsWLJDExESz/4cffjBNKu7QY7dv3242V4ez/n7o0CEzr0GHgb788suyePFiMzS2e/fuZuRP586dxdvoUFoNCjqXY/ny5SkjqQDA17hdYxg+fLi5WWtzTUhISMr+1q1bm/4Hd8TExEirVq1SHruagHr06CGzZs2SoUOHmm/iffv2lVOnTpmOb51g506VyNM0COjQ2urVq8v+/fvNPI8aNWpIr169crpoAJA9gUG/uWu7elranPTnn3+6da6WLVuam+qVaK1h7NixZvNWp0+fNv0av/32mxmye++998orr7wiefPmzemiAUD2BIaiRYvK0aNHTR9Aatu2bQvICV0PPPCA2QDAX7jdx6AjboYNG2bmEug3eh0bu2HDBnn++edNHwAAIMACgy5/oW3oOgRUO491eQyd9atrBOnyFQCAAGtK0lm+06dPl8jISLNGkAYHHcOvwzYBAL4vS/MYlM5Z0A0AEICBwZ3ltCdOnHg95QEA+EJg0BFHmaGd0QCAAAgMq1ev9nxJAABe4boW8zl8+LDZAAABHBguXrxoRiRp0pxKlSqZTX/XoaoXLlzwTCkBAN47KumZZ54xi+dp1rJbbrnF7NP8A5q57K+//iJzGQAEWmDQdZLmzZtny9Vcr149M+HtoYceIjAAQKAFBs11oM1HaenaSTr5zd+M2+bewoCZMbxhCcfPCQA51sfw9NNPy0svvWTLq6y/64qi+hwAIMBqDDqnQbOqlStXTurXr5+SpOf8+fPSpk0b6dKlS8prtS8CABAAy25rzoHUtH8BABCggUFzPQMA/BfZ6gEA11dj0LkKo0aNMstkxMXFmUQ9qZ04ccLdUwIAfDkwPProoybpfZ8+faRUqVIsnAcAgR4YvvnmG1m/fn3KiCQAQID3MWhaz7Nnz3qmNAAA3wsMb731lowcOVLWrl1r+hvi4+NtGwAgAOcxaABo3bq1bb9lWaa/4dKlS06WDwDg7YGhW7dukjdvXrOYHp3PAOB/3A4MO3fuNMtiVK9eXTxNax+6nPfs2bPl2LFjUrZsWenZs6fJ/UBAAgAvCQyNGzc2WduyIzCMHz/eLOP9/vvvS+3atSUmJkZ69eplEgMNGDDA49cHgECUpUQ9AwcOlCFDhkjdunVNs1JqmpvBKd9++6106tRJ2rdvbx7rct9z586VzZs3O3YNAMB1BoauXbuan717907Zp806nuh8btasmbz77ruyd+9eufHGG80qrjqHYuLEiY5dAwBwnYEhNjZWssvw4cPNCCidO5E7d24TdDTvg3aAX4nmhkidK8I1hFaX7ki7fEemWJY4LUvlAOB7vOj+4c5xbgeGihUrSnb55JNP5KOPPjIjoLSPYfv27TJo0CDTCd2jR48Mj4mOjpaoqKh0+48fPy7Jyclul6HwWefnZsTFMaQXCASFvej+kZCQkOnXBlnaBpQFu3fvlkOHDpkEPandfffd4hTN86C1hv79+6fse/nll80opZ9//jnTNQY9z8mTJyU0NNTtMkzY/pc4bWiDMMfPCcD7TPCi+4feC4sVKyanT5++5r3Q7RrDgQMH5J577pEdO3ak9C0o1/BRJ/sYzpw5I7ly2Sdna5PS1apEmpNat7T0PGnPlSkeGBabpXIA8D1B3nP/cOc4t6+gI5IiIiLMktuFChWSXbt2ybp168ww1jVr1oiTOnbsaPoUvvjiC/n1119l4cKFpuNZAxMAwDPcrjFs3LhRVq1aJSVKlEj5Ft6iRQvTtq9zC3Tym1OmTJkikZGR0q9fPxOItG/hiSeeMPkgAABeEhi0qSgkJMT8rsHhyJEjZrKbdkrv2bPH0cLpdSZNmmQ2AICXBoY6deqY+QTanNS0aVOZMGGC5MuXz8w3qFy5smdKCQDw3sCg6xQlJSWZ38eOHSsdOnSQW2+9VcLCwuTjjz/2RBkBAN4cGNq2bZvye9WqVc2wUc3zrMOgWNgOAHyf26OSdKJYWsWLFzdBQYewAgACLDDownk6fDSt1157TZo0aeJUuQAAvhIYBg8eLPfee6889dRTJvfz77//Lm3atDGd0Lp0BQAgwALD0KFDzVyGb775xiyxrZvONP7xxx+ZeAYAfiBLc6u101mHrepsZF1/Q5fiLl26tPOlAwB4f2DYsGGDqSXs27fP1BI0w5om79HgoAvVAQACLDC0bt3aBIFNmzZJzZo15bHHHjPLYOhKq9oxDQAIsHkMX331ldx22222fVWqVDE1CV3wDgAQYDWGtEEh5US5cpkF7wAAARIY2rVrZxI8uIwbN05OnTqV8vivv/6SWrVqOV9CAIB3Bobly5fbMqO9+uqrZikMl4sXLzq+uioAwIsDQ9oMoFnMCAoA8HLkmAQAZC0w6CJ5aVdPZTVVAAjg4aradNSzZ0+z/IVKTk6WJ598UoKDg83j1P0PAIAACAw9evSwPX7kkUfSvaZ79+7OlAoA4P2BYebMmZ4tCQDAK9D5DACwITAAAGwIDAAAGwIDAMD9wNCoUaOUXAtjx46VM2fOZOYwAIC/BoaffvpJkpKSzO9RUVGSmJgo2UVzSuvQ2LCwMClYsKDJ+RATE5Nt1weAQJOp4aoNGjSQXr16SYsWLcxEt9dee00KFy6c4WtHjRrlWOG0ltK8eXNp1aqVfPnll1KyZEmTOa5YsWKOXQMAkIXAMGvWLBk9erQsXbrULIOhN+k8edIfqs85GRjGjx8v5cuXt82hiIiIcOz8AIAsBobq1avLvHnzUhLyrFy5UsLDw8XTFi9eLG3btpX7779f1q5dKzfccIP069dPHn/8cY9fGwACldupPS9fvizZ5cCBAzJt2jQZPHiwvPDCC7JlyxYZMGCA5MuXL90SHS66ZlPqdZvi4+NTyp2lsntgefHs/AwB5CDLe+4f7hzndmBQv/zyi0yaNMl0SivN3DZw4ECT+9lJ+kYaN25skgKphg0bys6dO+Xtt9++YmCIjo42HeRpHT9+3Cz8567CZ/8OLE6Ki7vk+DkBeJ/CXnT/SEhI8Fxg0Exud999t+mQ1o5htWHDBqldu7YsWbJE/vWvf4lTypQpky5daM2aNeWzzz674jEjRowwNYzUNQbtp9CO69DQULfLkHgktzgtPDzM8XMC8D6JXnT/KFCggOcCw/Dhw+XZZ581OZ/T7h82bJijgUEDT9p0oXv37pWKFSte8RhdFty1NHhq2jeim9s8kHMiS+UA4HuCvOf+4c5xbl9Bm4/69OmTbn/v3r1l9+7d4iQNQJs2bTJNSfv375c5c+bIu+++K/3793f0OgCA6wgM2iSzffv2dPt1n9MjlW6++WZZuHChzJ07V+rUqSMvvfSS6dvo1q2bo9cBAFxHU5IOFe3bt68ZMdSsWbOUPgadc5C6bd8pHTp0MBsAwEsDQ2RkpISEhMjrr79uOnpV2bJlZcyYMWYoKQAgwAKDzm7Wtn/dXMOfNFAAAPxDluYxuBAQAMD/MG4SAGBDYAAA2BAYAABZDwwXLlyQNm3amJwIAAD/5FZgyJs3r/z444+eKw0AwPeakjTN5nvvveeZ0gAAfG+46sWLF2XGjBny9ddfy0033STBwcG25ydOnOhk+QAA3h4YNB9Co0aNUlY6TTv5DQAQYIFh9erVnikJAMC3h6vqMtiatOfs2bPmseWBFHYAAB8IDH/99ZcZsnrjjTdKu3bt5OjRo2a/5mh47rnnPFFGAIA3BwZdPE+HrR46dEgKFSqUsr9r166ybNkyp8sHAPD2PoavvvrKNCGVK1fOtr9atWpy8OBBJ8sGAPCFGkNSUpKtpuBy4sSJDHMtAwD8PDDceuut8sEHH9iGqF6+fFkmTJggrVq1crp8AABvb0rSAKCdzzExMXL+/HkZOnSo7Nq1y9QYNMUnACDAagx16tQxE9tatGghnTp1Mk1LXbp0kW3btkmVKlU8U0oAgHdncCtSpIiMHDnS+dIAAHwzMJw8edIspPfTTz+Zx7Vq1ZJevXpJ8eLFnS4fAMDbm5LWrVsnlSpVksmTJ5sAoZv+HhERYZ4DAARYjaF///5mMtu0adMkd+7cZt+lS5ekX79+5rkdO3Z4opwAAG+tMegaSbr0hSsoKP198ODB5jkAQIAFBl1y29W3kJruq1+/vnjSuHHjzLyJQYMGefQ6ABDIMtWUlDqd54ABA2TgwIGmdvCPf/zD7Nu0aZNMnTrV3Lg9ZcuWLfLOO+9IvXr1PHYNAEAmA0ODBg3MN/XUS2vrxLa0Hn74YdP/4LTExETp1q2bTJ8+XV5++WXHzw8AcDMwxMbGSk7STu327dvL7bffTmAAAG8IDBUrVpScMm/ePNm6datpSsqMc+fOmc0lPj7e/NT1nHRzmwcSEGWpHAB8j+U99w93jsvSBLcjR47I+vXrJS4uLt3FtA/CKYcPHzb9GStWrJACBQpk6pjo6GiJiopKt//48eOSnJzsdhkKn/07sDgpLu6S4+cE4H0Ke9H9IyEhIdOvDbLczMk5a9YseeKJJyRfvnwSFhZm+h5SThYUJAcOHBCnLFq0SO655x7b0FidM6HXyZUrl6kZpH7uSjWG8uXLm4l4oaGhbpdhwva/xGlDG4Q5fk4A3meCF90/9F5YrFgxOX369DXvhW7XGCIjI2XUqFEyYsQIc3P2JF3FNe2EOV16o0aNGjJs2LB0QUFpToiM8kJoWbNU3lSBzyme/twAeIkg77l/uHOc24HhzJkz8uCDD2bLzS0kJMSs5ppacHCwqamk3Q8AcIbbd/c+ffrIp59+6tDlAQDexu0ag3budujQQZYtWyZ169aVvHnz2p6fOHGieNKaNWs8en4ACHRZCgzLly+X6tWrm8dpO58BAAEWGF5//XWZMWOG9OzZ0zMlAgD4Vh+Djvhp3ry5Z0oDAPC9wKATzqZMmeKZ0gAAfK8pafPmzbJq1SpZunSp1K5dO13n84IFC5wsHwDA2wND0aJFpUuXLp4pDQDA9wLDzJkzPVMSAIBXYG0GAMD11RgiIiKuOl/ByUX0AAA+EBjS5lu+cOGCbNu2zcyEHjJkiJNlAwD4QmDQ4aoZ0ZzPMTExTpQJAOAPfQx33XWXfPbZZ06dDgDg64Fh/vz5Urx4cadOBwDwlaakhg0b2jqfNQHcsWPHTOrMt956y+nyAQC8PTB07tzZ9lgT9pQsWVJatmxpMqsBAAIsMIwePdozJQEAeAUmuAEAslZj0CajayXi0ecvXryY2VMCAHw5MCxcuPCKz23cuFEmT54sly9fdqpcAABvDwydOnVKt2/Pnj0yfPhwWbJkiXTr1k3Gjh3rdPkAAL7Qx3DkyBF5/PHHpW7duqbpaPv27fL+++9LxYoVnS8hAMB7A8Pp06dl2LBhUrVqVdm1a5esXLnS1Bbq1KnjuRICALyzKWnChAkyfvx4KV26tMydOzfDpiUAQAAFBu1LKFiwoKktaLORbhkhtScABEhg6N69+zWHqwIAAigwzJo1S7JbdHS0qYH8/PPPprbSrFkz05xVvXr1bC8LAAQKr575vHbtWunfv79s2rRJVqxYYZIC3XHHHZKUlJTTRQMyZd26ddKxY0cpW7asqXEvWrQop4vkc/gMs59XBwbNCtezZ0+pXbu21K9f39RaDh06JN9//31OFw3IFP0So3+7msgKWcNn6AOL6OUkHS6ryPsAX6EJrHRD1vEZZj+fCQy63Ibmm27evPlV502cO3fObC7x8fEpx2dpyQ7LEqexdEjgyvLfIXzzM7S85/7hznE+Exi0r2Hnzp2yfv36a3ZYR0VFpduviYSSk5Pdvm7hs38HFifFxV1y/JzwnVpvXFxcThfDp/nSZ1jYi+4fCQkJ/hUYnn76aVm6dKnphCpXrtxVXztixAgZPHiwrcZQvnx5k0woNDTU7WsnHsktTgsPD3P8nPANRYoUkfDw8Jwuhk/zpc8w0YvuHwUKFPCPwKBpQ5955hmzsuuaNWskIiLimsfkz5/fbBktG66b2zwwdyNL5YBfyPLfIXzzMwzynvuHO8fl8fbmozlz5sjnn38uISEhJre06xuDzmsAADjPqwPDtGnTzE/NJ53azJkzzTBWwNslJibK/v37Ux7Hxsaa1Yh1ZF2FChVytGy+gs8w+3l9UxLgy2JiYqRVq1Ypj139Xz169MiR1QR8EZ9h9vPqwAD4Oq3t8gXn+vAZZj8f6cEBAGQXAgMAwIbAAACwITAAAGwIDAAAGwIDAMCGwAAAsCEwAABsCAwAABsCAwDAhsAAALBhrSTgOozb9qfj5xzesIQEEk98hoH4OTqJGgMAwIbAAACwITAAAGwIDAAAGwIDAMCGwAAAsCEwAABsCAwAABsCAwDAhsAAALAhMAAAbAgMAADfCwxTp06VSpUqSYECBaRp06ayefPmnC4SAPgtrw8MH3/8sQwePFhGjx4tW7dulfr160vbtm0lLi4up4sGAH7J6wPDxIkT5fHHH5devXpJrVq15O2335ZChQrJjBkzcrpoAOCXvDownD9/Xr7//nu5/fbbU/blypXLPN64cWOOlg0A/JVXJ+r5888/5dKlS1KqVCnbfn38888/Z3jMuXPnzOZy+vRp8/PUqVNy+fJlt8uQnBAvTjt1yqs/driBvw/v/Ay95XNM9qK/j/j4v8tiWdY1X5vzn5zDoqOjJSoqKt3+ihUrirdIXzrg/+Pvwxn++jlGXefxCQkJUqRIEd8NDCVKlJDcuXPLH3/8Yduvj0uXLp3hMSNGjDCd1S5aSzhx4oSEhYVJUFCQR8qpkbh8+fJy+PBhCQ0NFX/ir+/NX9+XP7833tf10ZqCBoWyZcte87VeHRjy5csnN910k6xcuVI6d+6ccqPXx08//XSGx+TPn99sqRUtWjRbyqv/qP70BxsI781f35c/vzfeV9Zdq6bgE4FB6bf/Hj16SOPGjaVJkyYyadIkSUpKMqOUAADO8/rA0LVrVzl+/LiMGjVKjh07Jg0aNJBly5al65AGAARIYFDabHSlpiNvoE1XOgEvbROWP/DX9+av78uf3xvvK/sEWZkZuwQACBhePcENAJD9CAwAABsCAwDAhsAAALAhMDjA3/JF6LIiN998s4SEhEh4eLiZXLhnzx7xR+PGjTMz4gcNGiS+7vfff5dHHnnEzPIvWLCg1K1bV2JiYsTX6XppkZGREhERYd5XlSpV5KWXXsrUmj/eZN26ddKxY0cz81j/5hYtWmR7Xt+PDssvU6aMeZ+6WOi+fftypKwEhuvkj/ki1q5dK/3795dNmzbJihUr5MKFC3LHHXeYiYX+ZMuWLfLOO+9IvXr1xNedPHlSmjdvLnnz5pUvv/xSdu/eLa+//roUK1ZMfN348eNl2rRp8uabb8pPP/1kHk+YMEGmTJkiviQpKcncH/SLZEb0PU2ePNmkFvjuu+8kODjY3EuSk5OzvawapXAdmjRpYvXv3z/l8aVLl6yyZcta0dHRlr+Ii4vTr2bW2rVrLX+RkJBgVatWzVqxYoV12223WQMHDrR82bBhw6wWLVpY/qh9+/ZW7969bfu6dOlidevWzfJVImItXLgw5fHly5et0qVLW//+979T9p06dcrKnz+/NXfu3GwvHzWG6xAo+SJcS5cXL15c/IXWiNq3b2/7t/NlixcvNsvG3H///ab5r2HDhjJ9+nTxB82aNTPro+3du9c8/uGHH2T9+vVy1113ib+IjY01Kzuk/nvUdY20aTon7iU+MfPZn/JF+BpdtFDb37WZok6dOuIP5s2bZ5r9tCnJXxw4cMA0t2iz5gsvvGDe24ABA8xClLrWmC8bPny4WYG0Ro0aZrVl/T/3yiuvSLdu3cRfHDt2zPzM6F7iei47ERhwzW/WO3fuNN/Q/IEubTxw4EDTd6KDBfyFBnCtMbz66qvmsdYY9N9N26t9PTB88skn8tFHH8mcOXOkdu3asn37dvNlRTtxff29eSuakrI5X4Qv0fWpli5dKqtXr5Zy5cqJP9CmPx0Y0KhRI8mTJ4/ZtLNdO/30d/026ot0JIvmRE+tZs2acujQIfF1Q4YMMbWGBx980Iy0evTRR+XZZ581o+f8Ren/u194y72EwOBQvggXV76IW265RXyV9o1pUFi4cKGsWrXKDBP0F23atJEdO3aYb52uTb9pa7OE/q6B3hdpU1/aIcXaJu9NmQuz6syZM6bvLjX9d8pKql5vFRERYQJA6nuJNp/p6KQcuZdke3e3n5k3b54ZOTBr1ixr9+7dVt++fa2iRYtax44ds3zVU089ZRUpUsRas2aNdfTo0ZTtzJkzlj/yh1FJmzdvtvLkyWO98sor1r59+6yPPvrIKlSokDV79mzL1/Xo0cO64YYbrKVLl1qxsbHWggULrBIlSlhDhw61fG0k3LZt28ymt96JEyea3w8ePGieHzdunLl3fP7559aPP/5oderUyYqIiLDOnj2b7WUlMDhgypQpVoUKFax8+fKZ4aubNm2yfJn+0Wa0zZw50/JH/hAY1JIlS6w6deqYLyo1atSw3n33XcsfxMfHm38f/T9WoEABq3LlytbIkSOtc+fOWb5k9erVGf6/0sDnGrIaGRlplSpVyvwbtmnTxtqzZ0+OlJVltwEANvQxAABsCAwAABsCAwDAhsAAALAhMAAAbAgMAAAbAgMAwIbAAIhkmFErM2bNmiVFixb1SJmAnEJggE/r2bOnuanrptnLdM2ZoUOHZlvWq65du6bkCcgKXbRP04vqktKazlFzXuga/P/9738dLSfgDpbdhs+78847ZebMmSYFqa6eqksxa6DQFJCepNfTm7luWRUVFWXSi2raSl3MTxdO0zzNmqrTkwmmdAFI4EqoMcDn5c+f36xMWb58eencubPJgqX5FlwqVaokkyZNsh3ToEEDGTNmjG3f0aNHTVYwvdFXrlxZ5s+fn/Lcr7/+aoKN5vi+7bbbTC4HzRGQUVPSkiVL5Oabbzav0aXZ77nnnqtmXuvXr5/JvKa1Hc0J3KdPH3n++edTXqOriGo+4KpVq5r3WqFCBZOoxkVXi23durUpd1hYmPTt21cSExNttSr9XPQYzWFQvXr1lNwUDzzwgCm/1lQ6depk3idAYIBf0eQ03377bZa+EUdGRsq9995rUkfqMty6/r8mn09N8wJooh/dr4na0/riiy9MIGjXrp1s27bNLKPcpEmTK15TA5oubX78+PErvmbEiBGmuUnLt3v3bpOwxpXpSxPMazmKFStmsrZ9+umn8vXXX5tl01PTcuiy3BowNceG1nb0uJCQEPnmm29kw4YNUrhwYVP70hoFAlyOLN0HOERXpsydO7cVHBxsVqTUP+lcuXJZ8+fPT3lNxYoVrTfeeMN2XP369a3Ro0enPNbjnnzySdtrmjZtapYgV7rcs75m0qRJttfoirO6RLnLLbfc4laS+l27dlk1a9Y0Za5bt671xBNPWP/73/9sK4vq+5o+fXqGx+sKqsWKFbMSExNT9n3xxRfmfK6l3/Uz0hU7U69G+uGHH1rVq1c3K3q66PMFCxa0li9fnunywz9RY4DPa9WqlUmyo0lNtH+hV69e5pu/u9ImRNHHaWsM2g9wNVoOTQaUWZp1TWs5mzZtkt69e5vsch07dpTHHnvMPK/XP3fu3BXPqc9r81NwcLAtaY82P6VO3KOZz1LXorRWtH//flNj0JqCbtqcpJ32v/zyS6bLD/9E5zN8nt4Utf1dzZgxw9wo33vvPdNWrzT7V9rV5bUpJavXupqsdERr+bRPQjfNZTx79myTvnLkyJHX1bF9tXJrH4RmH9R+krRKlizpyDXhu6gxwK/oTfaFF16QF198Uc6ePZtyo9OOZRcd+RMbG5vuWP3Wnvax5k12R7169WzpGbPClbtZ+w+qVatmgsOVzqnl02//+loX7S/Qz8HVyZwRzXm9b98+CQ8PN0E19VakSJHrKj98H4EBfkdH+GhO4KlTp5rHOmLnww8/NJ2sOoJHm5syyu2sHbda49B5CaNHj5bNmzen68S9Fj1u7ty55qc28+j1rjZs9r777pM33njDNIMdPHhQ1qxZI/3795cbb7zRzG3QkU3Dhg0zczM++OAD08yjAUtrREo7yfU1+p60SWr16tXyzDPPmBqHq4M6I3qcjpjSkUj6uWig1GsPGDBAfvvtN7feM/wPgQF+J0+ePOaGrkM89Zu0jurRIaYdOnSQ9u3bm6GbVapUyXBOwbx588y3fr0J6w3e9e09s1q2bGkCjA5D1SGxGpQ0wFyJjgzS4a3ar6DBQG/wGhC++uor8z6UjkZ67rnnZNSoUaaGoJPqtC9CFSpUSJYvXy4nTpwwTVEaaLQ/QudFXI0et27dOjP0tUuXLua82vSmfQyhoaFuvWf4H1J7AgBsqDEAAGwIDAAAGwIDAMCGwAAAsCEwAABsCAwAABsCAwDAhsAAALAhMAAAbAgMAAAbAgMAwIbAAACQ1P4f/9gHq2YoLk8AAAAASUVORK5CYII=", |
| 229 | + "text/plain": [ |
| 230 | + "<Figure size 400x300 with 1 Axes>" |
| 231 | + ] |
| 232 | + }, |
| 233 | + "metadata": {}, |
| 234 | + "output_type": "display_data" |
| 235 | + } |
| 236 | + ], |
210 | 237 | "source": [
|
211 | 238 | "def create_prompt(justification, criteria_met, rubric=CHALLENGING_RUBRIC):\n",
|
212 | 239 | " prompt = f\"\"\"\n",
|
|
254 | 281 | "score_counts = Counter(index_to_score.values())\n",
|
255 | 282 | "scores = sorted(score_counts.keys())\n",
|
256 | 283 | "\n",
|
257 |
| - "plt.figure(figsize=(10, 6))\n", |
| 284 | + "plt.figure(figsize=(4, 3))\n", |
258 | 285 | "plt.bar(scores, [score_counts[s] for s in scores], color='skyblue')\n",
|
259 |
| - "plt.xlabel('Score')\n", |
| 286 | + "plt.xlabel('Rubric Score')\n", |
260 | 287 | "plt.ylabel('Number of Examples')\n",
|
261 |
| - "plt.title('Distribution of Justification Scores')\n", |
262 | 288 | "plt.xticks([0, 2, 4, 6, 8, 10])\n",
|
263 | 289 | "plt.grid(axis='y', alpha=0.3)\n",
|
264 | 290 | "plt.tight_layout()\n",
|
|
0 commit comments