gcperformance
diff --git a/‎notebooks/drf-yrs.ipynb‎
Lines changed: 41 additions & 44 deletions b/‎notebooks/drf-yrs.ipynb‎
Lines changed: 41 additions & 44 deletions
diff --git a/‎notebooks/experiment-template.ipynb‎
Lines changed: 1 addition & 3 deletions b/‎notebooks/experiment-template.ipynb‎
Lines changed: 1 addition & 3 deletions
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 2,
    "id": "fd2d413a-666f-49d4-b86c-7d6898a496a5",
    "metadata": {},
    "outputs": [],
@@ -29,7 +29,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 3,
    "id": "45467185",
    "metadata": {},
    "outputs": [],
@@ -53,21 +53,8 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
-   "id": "2373cc17",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "drf.loc[(drf['program_id']=='BRB01')]\n",
-    "# drf['valid_plan'] = (drf['report_yr'] > latest_si_fy)\n",
-    "\n",
-    "# drf[(drf['org_id']==130) & (drf['program_id']=='ISS02') & (drf['planned_actual']=='planned') & (drf['report_yr'] > latest_si_fy)]\n"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a5c0a639",
+   "execution_count": 4,
+   "id": "e9dde87f",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -76,6 +63,8 @@
     "drf = standardize_column_names(drf)\n",
     "drf['fiscal_yr'] = drf['fiscal_yr'].apply(clean_fiscal_yr)\n",
     "\n",
+    "# Coerce org_id to numeric first (unparseable values become missing), then cast to a\n",
+    "# nullable integer; calling astype(int) before to_numeric would raise before coercion could apply\n",
+    "si['org_id'] = pd.to_numeric(si['org_id'], errors='coerce').astype('Int64')\n",
+    "\n",
     "# Define columns related to planned and actual measures: spending and FTEs\n",
     "# These columns will be unpivoted / melted\n",
     "fte_spend_cols = [\n",
@@ -111,44 +100,60 @@
     "\n",
     "# Latest SI fiscal year per org (end year as int)\n",
     "si_latest = (si.assign(lat_end=pd.to_numeric(si['fiscal_yr'].str.split('-').str[-1], errors='coerce'))\n",
-    "               .groupby('org_id', as_index=False)['lat_end'].max()\n",
-    "               .rename(columns={'lat_end':'latest_si_yr'}))\n",
+    "            .groupby('org_id', as_index=False)['lat_end'].max()\n",
+    "            .rename(columns={'lat_end':'latest_si_yr'}))\n",
     "\n",
     "drf = drf.merge(si_latest, on='org_id', how='left')\n",
     "\n",
     "\n",
     "# Split planned vs actual; only drop blank measures\n",
     "drf_actuals = drf[drf['planned_actual']=='actual'].dropna(subset=['measure']).copy()\n",
     "drf_planned = drf[drf['planned_actual']=='planned'].dropna(subset=['measure']).copy()\n",
-    "\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "3cf6e679",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "# Drop any actuals from the fiscal year in progress\n",
-    "# TODO: Turn this into a function that looks at the current datetime\n",
-    "current_yr = 2026\n",
-    "drf_actuals = drf_actuals[drf_actuals['measure_yr']<current_yr]\n",
-    "\n",
-    "# Determine the highest measure year for actuals\n",
-    "latest_actuals = (drf_actuals\n",
-    "                  .groupby(['org_id', 'program_id', 'spending_fte'], as_index=False)['report_yr']\n",
-    "                  .max()\n",
-    "                  .rename(columns={'report_yr':'report_yr_actuals'})\n",
-    ")\n",
+    "# An in-progress fiscal year is flagged by a \".\" in the ftes measure.\n",
+    "# Per org/program, take the max report year among ftes actuals whose measure is not \".\"\n",
+    "latest_actuals = (\n",
+    "    drf_actuals[(drf_actuals['spending_fte'] == 'ftes') & (drf_actuals['measure'] != '.')]\n",
+    "    .groupby(['org_id', 'program_id'], as_index=False)\n",
+    "    .agg(latest_report_yr_actuals=('report_yr', 'max'))\n",
+    ")\n",
+    "\n",
+    "# Merge in the highest measure year for actuals in the actuals table\n",
+    "drf_actuals = drf_actuals.merge(latest_actuals, \n",
+    "                                on=['org_id', 'program_id'],\n",
+    "                                how='left') \n",
+    "\n",
+    "# Only keep actual years that are less than or equal to the latest actual report year\n",
+    "# NOTE: fillna(0) compares rows that have no latest actual report year against 0, so those actuals are dropped (assuming measure_yr is a positive year) - confirm this is intended\n",
+    "drf_actuals = drf_actuals[\n",
+    "    drf_actuals['measure_yr'] <= (drf_actuals['latest_report_yr_actuals'].fillna(0))\n",
+    "]\n",
+    "\n",
     "\n",
     "# Merge in the highest measure year for actuals in the planned table\n",
     "drf_planned = drf_planned.merge(latest_actuals, \n",
-    "                                on=['org_id', 'program_id', 'spending_fte'],\n",
+    "                                on=['org_id', 'program_id'],\n",
     "                                how='left') \n",
     "\n",
     "# Only keep planned years that are greater than the latest actual report year\n",
-    "# fillna(-np.inf) assures that all planned values are included, even if there are not associated actual report years\n",
+    "# fillna(0) assures that all planned values are included, even if there are not associated actual report years\n",
     "drf_planned = drf_planned[\n",
-    "    drf_planned['measure_yr'] > (drf_planned['report_yr_actuals'].fillna(0))\n",
+    "    drf_planned['measure_yr'] > (drf_planned['latest_report_yr_actuals'].fillna(0))\n",
     "]\n",
     "\n",
     "# # # Each report year has 3 measure years for planned values.\n",
     "# # Only keep records that have the highest report year for that given program, measure type, and measure year\n",
     "idx = (drf_planned\n",
-    "       .groupby(['org_id', 'program_id', 'spending_fte', 'measure_yr'])['report_yr']\n",
-    "       .idxmax())\n",
+    "    .groupby(['org_id', 'program_id', 'spending_fte', 'measure_yr'])['report_yr']\n",
+    "    .idxmax())\n",
     "drf_planned = drf_planned.loc[idx]\n",
     "\n",
     "# # # Concatenate actuals and planned entries\n",
@@ -176,16 +181,8 @@
     "drf['report_yr'] = (drf['report_yr']-1).apply(str) +\"-\"+ (drf['report_yr']).apply(str)\n",
     "drf['measure_yr'] = (drf['measure_yr']-1).apply(str) +\"-\"+ (drf['measure_yr']).apply(str)\n",
     "drf['si_link_yr'] = (drf['si_link_yr']-1).apply(str) +\"-\"+ (drf['si_link_yr']).apply(str)\n",
-    "drf['latest_si_yr'] = (drf['latest_si_yr']-1).apply(str) +\"-\"+ (drf['latest_si_yr']).apply(str)"
+    "drf['latest_si_yr'] = (drf['latest_si_yr']-1).apply(str) +\"-\"+ (drf['latest_si_yr']).apply(str)\n"
    ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "dacefaa6",
-   "metadata": {},
-   "outputs": [],
-   "source": []
   }
  ],
  "metadata": {
 
@@ -2,7 +2,7 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
+   "execution_count": null,
    "id": "fd2d413a-666f-49d4-b86c-7d6898a496a5",
    "metadata": {},
    "outputs": [],
@@ -28,8 +28,6 @@
     "import pytz\n",
     "from pathlib import Path\n",
     "\n",
-    "\n",
-    "\n",
     "base_dir = Path.cwd()\n",
     "parent_dir = base_dir.parent\n",
     "config = get_config()"