Upload changes to photoz notebook (#1236)

camposandro · web-flow · commit 82f018f86942 · 2026-01-29T14:59:31.000-05:00
diff --git a/docs/tutorials/pre_executed/rubin_dp1_photoz.ipynb b/docs/tutorials/pre_executed/rubin_dp1_photoz.ipynb
@@ -15,7 +15,8 @@
     "\n",
     "In this tutorial, we will:\n",
     "- access a photo-z catalog derived from Rubin’s Data Preview 1 using LSDB (for data rights holders)\n",
-    "- access the same catalog using pandas or any other Parquet reader (for all users)"
+    "- access the same catalog using pandas or any other Parquet reader (for all users)\n",
+    "- reconstruct the `qp.Ensemble` from the PDF nested columns"
    ]
   },
   {
@@ -40,11 +41,11 @@
    "id": "ed4c15e1",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-01-07T21:57:50.902229Z",
-     "iopub.status.busy": "2026-01-07T21:57:50.901979Z",
-     "iopub.status.idle": "2026-01-07T21:57:56.750842Z",
-     "shell.execute_reply": "2026-01-07T21:57:56.750150Z",
-     "shell.execute_reply.started": "2026-01-07T21:57:50.902209Z"
+     "iopub.execute_input": "2026-01-29T19:36:21.186555Z",
+     "iopub.status.busy": "2026-01-29T19:36:21.186365Z",
+     "iopub.status.idle": "2026-01-29T19:36:29.010010Z",
+     "shell.execute_reply": "2026-01-29T19:36:29.009172Z",
+     "shell.execute_reply.started": "2026-01-29T19:36:21.186537Z"
     }
    },
    "outputs": [
@@ -606,11 +607,11 @@
    "id": "5bc65fbf",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-01-07T21:57:58.183201Z",
-     "iopub.status.busy": "2026-01-07T21:57:58.182757Z",
-     "iopub.status.idle": "2026-01-07T21:58:05.427771Z",
-     "shell.execute_reply": "2026-01-07T21:58:05.427212Z",
-     "shell.execute_reply.started": "2026-01-07T21:57:58.183166Z"
+     "iopub.execute_input": "2026-01-29T19:36:29.013280Z",
+     "iopub.status.busy": "2026-01-29T19:36:29.012961Z",
+     "iopub.status.idle": "2026-01-29T19:36:40.715891Z",
+     "shell.execute_reply": "2026-01-29T19:36:40.715202Z",
+     "shell.execute_reply.started": "2026-01-29T19:36:29.013247Z"
     }
    },
    "outputs": [
@@ -858,11 +859,11 @@
    "id": "e82467ec",
    "metadata": {
     "execution": {
-     "iopub.execute_input": "2026-01-07T21:58:10.409295Z",
-     "iopub.status.busy": "2026-01-07T21:58:10.409023Z",
-     "iopub.status.idle": "2026-01-07T21:58:10.563053Z",
-     "shell.execute_reply": "2026-01-07T21:58:10.562533Z",
-     "shell.execute_reply.started": "2026-01-07T21:58:10.409276Z"
+     "iopub.execute_input": "2026-01-29T19:36:40.718156Z",
+     "iopub.status.busy": "2026-01-29T19:36:40.717943Z",
+     "iopub.status.idle": "2026-01-29T19:36:40.869807Z",
+     "shell.execute_reply": "2026-01-29T19:36:40.869230Z",
+     "shell.execute_reply.started": "2026-01-29T19:36:40.718137Z"
     }
    },
    "outputs": [
@@ -899,16 +900,134 @@
     "plt.ylabel(\"kNN\")"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "1f86d413",
+   "metadata": {},
+   "source": [
+    "## 3. Reconstructing a QP Ensemble\n",
+    "\n",
+    "Here we demonstrate how to reconstruct a dictionary of `qp.Ensemble` objects from the HATS catalog.\n",
+    "\n",
+    "Based on each nested column's suffix (`interp`, `mixmod`, `norm`, `hist`, or a quantile length such as `99`, `20`, or `5`), we parse the nested data fields and rebuild each ensemble using the appropriate qp generator."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "721b27c6-6dea-456d-8304-d8f297a1d60f",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-01-29T19:36:40.872245Z",
+     "iopub.status.busy": "2026-01-29T19:36:40.872024Z",
+     "iopub.status.idle": "2026-01-29T19:36:41.327487Z",
+     "shell.execute_reply": "2026-01-29T19:36:41.326924Z",
+     "shell.execute_reply.started": "2026-01-29T19:36:40.872219Z"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "import qp\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def hats_to_qp(df):\n",
+    "    \"\"\"Reconstruct `qp.Ensemble` objects from a partition.\n",
+    "\n",
+    "    Every nested column of `df` whose name ends in a recognized suffix\n",
+    "    (`interp`, `mixmod`, `norm`, `hist`, or `99` / `20` / `5`) is rebuilt\n",
+    "    into one ensemble. Nested columns with any other suffix are skipped.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    df\n",
+    "        Frame whose `nested_columns` attribute lists the nested PDF columns.\n",
+    "\n",
+    "    Returns\n",
+    "    -------\n",
+    "    dict\n",
+    "        Maps each recognized nested column name to its `qp.Ensemble`.\n",
+    "    \"\"\"\n",
+    "    ensembles = {}\n",
+    "\n",
+    "    def extract(nf, subcol):\n",
+    "        \"\"\"Collect the `subcol` field of every row of `nf` into one array.\"\"\"\n",
+    "        return np.asarray([i[subcol] for i in nf])\n",
+    "\n",
+    "    for col in df.nested_columns:\n",
+    "        nf = df[col]\n",
+    "        data_dict = {}\n",
+    "        # The text after the last underscore selects the parameterization.\n",
+    "        ens_type = col.split(\"_\")[-1]\n",
+    "\n",
+    "        match ens_type:\n",
+    "            case \"interp\":\n",
+    "                data_dict[\"yvals\"] = extract(nf, \"yvals\")\n",
+    "                # Grid is read from the last row only and applied to all rows;\n",
+    "                # assumes every row shares the same xvals -- TODO confirm.\n",
+    "                data_dict[\"xvals\"] = np.asarray(nf.iloc[-1][\"xvals\"])\n",
+    "                gen_class = qp.interp_gen\n",
+    "            case \"mixmod\":\n",
+    "                data_dict[\"means\"] = extract(nf, \"means\")\n",
+    "                data_dict[\"stds\"] = extract(nf, \"stds\")\n",
+    "                data_dict[\"weights\"] = extract(nf, \"weights\")\n",
+    "                gen_class = qp.mixmod_gen\n",
+    "            case \"norm\":\n",
+    "                data_dict[\"loc\"] = extract(nf, \"loc\")\n",
+    "                data_dict[\"scale\"] = extract(nf, \"scale\")\n",
+    "                # The ensemble is built once, after the match, like every\n",
+    "                # other branch; building it here as well would double the work.\n",
+    "                gen_class = qp.stats.norm\n",
+    "            case \"hist\":\n",
+    "                data_dict[\"pdfs\"] = extract(nf, \"pdfs\")\n",
+    "                # Bin edges are hard-coded here rather than read from the column.\n",
+    "                data_dict[\"bins\"] = np.linspace(0, 3, 301)\n",
+    "                gen_class = qp.hist_gen\n",
+    "            case \"99\" | \"20\" | \"5\":\n",
+    "                data_dict[\"locs\"] = extract(nf, \"locs\")\n",
+    "                # Quantile levels are read from the last row only;\n",
+    "                # assumes they are identical across rows -- TODO confirm.\n",
+    "                data_dict[\"quants\"] = np.asarray(nf.iloc[-1][\"quants\"])\n",
+    "                gen_class = qp.quant_gen\n",
+    "            case _:\n",
+    "                # Unrecognized suffix: not a supported PDF column, skip it.\n",
+    "                continue\n",
+    "\n",
+    "        ensembles[col] = qp.Ensemble(gen_class, data=data_dict)\n",
+    "\n",
+    "    return ensembles"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4cb0f989-18e9-4ab0-bb0e-272de609e211",
+   "metadata": {},
+   "source": [
+    "Note that constructing `qp.Ensemble` objects is computationally expensive, so for demonstration purposes we only reconstruct them for the handful of objects returned by `head()`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "7b48b4b0-208c-4346-8ab6-1017f2072593",
+   "metadata": {
+    "execution": {
+     "iopub.execute_input": "2026-01-29T19:36:41.330241Z",
+     "iopub.status.busy": "2026-01-29T19:36:41.330050Z",
+     "iopub.status.idle": "2026-01-29T19:36:48.725613Z",
+     "shell.execute_reply": "2026-01-29T19:36:48.725023Z",
+     "shell.execute_reply.started": "2026-01-29T19:36:41.330224Z"
+    }
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "{'bpz_ens_interp': Ensemble(the_class=interp,shape=(5, 301)),\n",
+       " 'cmnn_ens_norm': Ensemble(the_class=norm,shape=(5, 1)),\n",
+       " 'dnf_ens_interp': Ensemble(the_class=interp,shape=(5, 301)),\n",
+       " 'fzboost_ens_interp': Ensemble(the_class=interp,shape=(5, 301)),\n",
+       " 'knn_ens_mixmod': Ensemble(the_class=mixmod,shape=(5, 10)),\n",
+       " 'lephare_ens_interp': Ensemble(the_class=interp,shape=(5, 301)),\n",
+       " 'tpz_ens_interp': Ensemble(the_class=interp,shape=(5, 301)),\n",
+       " 'gpz_ens_norm': Ensemble(the_class=norm,shape=(5, 1))}"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "hats_to_qp(dp1_pz_catalog.head())"
+   ]
+  },
   {
    "cell_type": "markdown",
    "id": "7328abf3-7443-49a9-ac88-59b792f1b783",
    "metadata": {},
    "source": [
     "## About\n",
     "\n",
-    "**Authors**: Sandro Campos\n",
+    "**Authors**: Sandro Campos, Sarah Pelesky, Tianqing Zhang\n",
     "\n",
-    "**Last run**: Jan 7, 2026\n",
+    "**Last run**: Jan 29, 2026\n",
     "\n",
     "If you use `lsdb` for published research, please cite following [instructions](https://docs.lsdb.io/en/stable/citation.html)."
    ]