Skip to content

Commit 7826a30

Browse files
committed
updated sid-list to display latest and first fy reported
1 parent 799a4f9 commit 7826a30

29 files changed

+6658
-6471
lines changed

inputs/si_2024.csv

Lines changed: 8 additions & 8 deletions
Large diffs are not rendered by default.

notebooks/experiments-drf.ipynb

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "f9a37f13-0b46-42f5-b8fe-99ff5099f890",
6+
"metadata": {},
7+
"source": [
8+
"# Figuring out the connection to DRF/DRR/DP figures"
9+
]
10+
},
11+
{
12+
"cell_type": "code",
13+
"execution_count": 42,
14+
"id": "a65fa6c7-24f9-4fb6-8382-096f3c487d10",
15+
"metadata": {},
16+
"outputs": [],
17+
"source": [
18+
"import pandas as pd\n",
19+
"import numpy as np\n",
20+
"import re\n",
21+
"import pytz\n",
22+
"import os\n",
23+
"from pathlib import Path\n",
24+
"import sys\n",
25+
"sys.path.append(\"/home/jovyan/shared/service-data\")\n",
26+
"\n",
27+
"from src.clean import clean_percentage, normalize_string, standardize_column_names, clean_fiscal_yr\n",
28+
"from src.load import load_csv_from_raw\n",
29+
"from src.export import export_to_csv\n",
30+
"from src.merge import merge_si, merge_ss"
31+
]
32+
},
33+
{
34+
"cell_type": "code",
35+
"execution_count": 43,
36+
"id": "048e8db3-f8e2-45f0-a9de-5ca5bc0fe351",
37+
"metadata": {},
38+
"outputs": [
39+
{
40+
"name": "stdout",
41+
"output_type": "stream",
42+
"text": [
43+
"Exported dept.csv to /home/jovyan/shared/service-data/outputs/utils\n",
44+
"Exported si.csv to /home/jovyan/shared/service-data/outputs\n"
45+
]
46+
}
47+
],
48+
"source": [
49+
"# Define the base directory\n",
50+
"base_dir = Path.cwd()\n",
51+
"parent_dir = base_dir.parent\n",
52+
"\n",
53+
"# File paths for input data\n",
54+
"data_files = {\n",
55+
" \"rbpo\": parent_dir / \"inputs\" / \"rbpo.csv\",\n",
56+
" \"org_var\": parent_dir / \"inputs\" / \"org_var.csv\",\n",
57+
" \"serv_prog\": parent_dir / \"inputs\" / \"serv_prog.csv\"\n",
58+
"}\n",
59+
"\n",
60+
"si = merge_si()\n",
61+
"rbpo = pd.read_csv(data_files[\"rbpo\"])\n",
62+
"serv_prog = pd.read_csv(data_files[\"serv_prog\"])\n",
63+
"\n",
64+
"rbpo = standardize_column_names(rbpo)\n",
65+
"rbpo['fiscal_yr'] = rbpo['fiscal_yr'].apply(clean_fiscal_yr)"
66+
]
67+
},
68+
{
69+
"cell_type": "markdown",
70+
"id": "77e12a73-f357-4834-a99f-eab56fcc87dd",
71+
"metadata": {},
72+
"source": [
73+
"# Define columns related to measures: spending and FTEs (planned and actual)\n",
74+
"fte_spend_cols = [\n",
75+
" 'planned_spending_1', 'actual_spending', 'planned_spending_2', 'planned_spending_3',\n",
76+
" 'planned_ftes_1', 'actual_ftes', 'planned_ftes_2', 'planned_ftes_3'\n",
77+
"]\n",
78+
"\n",
79+
"# Melt (unpivot) the DataFrame to long format\n",
80+
"drf = pd.melt(\n",
81+
" rbpo, \n",
82+
" id_vars=['fiscal_yr', 'org_id', 'program_id'], \n",
83+
" value_vars=fte_spend_cols, \n",
84+
" var_name='plan_actual_spendfte_yr', \n",
85+
" value_name='measure'\n",
86+
")\n",
87+
"\n",
88+
"# Split 'plan_actual_spendfte_yr' into separate columns for planned/actual, spending/FTEs, and year adjustment\n",
89+
"drf[['planned_actual', 'spending_fte', 'yr_adjust']] = drf['plan_actual_spendfte_yr'].str.split('_', expand=True)\n",
90+
"drf['yr_adjust'] = drf['yr_adjust'].fillna('1').astype(int) - 1\n",
91+
"\n",
92+
"# Calculate 4-digit 'measure_yr' and 'report_yr' from 'fiscal_yr' and 'yr_adjust'\n",
93+
"drf['measure_yr'] = drf['fiscal_yr'].str.split('-').str[1].astype(int) + drf['yr_adjust']\n",
94+
"drf['report_yr'] = drf['fiscal_yr'].str.split('-').str[1].astype(int)\n",
95+
"\n",
96+
"# Latest fiscal year in the Service inventory (four-digit fy, year of end of fy); hard-coded below, with the derivation from si left commented out\n",
97+
"# latest_si_fy = si['fiscal_yr'].str.split('-').str[1].astype(int).max()\n",
98+
"latest_si_fy = 2024\n",
99+
"\n",
100+
"# Separate actuals and future planned data\n",
101+
"drf_actuals = drf[\n",
102+
" (drf['planned_actual'] == 'actual') & \n",
103+
" (drf['report_yr'] <= latest_si_fy)\n",
104+
"].dropna()\n",
105+
"\n",
106+
"drf_planned = drf[\n",
107+
" (drf['planned_actual'] == 'planned') &\n",
108+
" (drf['report_yr'] > latest_si_fy) \n",
109+
"].dropna()\n",
110+
"\n",
111+
"# Each report year has 3 measure years for planned values.\n",
112+
"# Only keep records that have the highest report year for that given program, measure type, and measure year\n",
113+
"idx = drf_planned.groupby(['program_id', 'spending_fte', 'measure_yr'])['report_yr'].idxmax()\n",
114+
"drf_planned = drf_planned.loc[idx]\n",
115+
"\n",
116+
"drf_actuals_checksum = drf_actuals['measure'].sum()\n",
117+
"drf_planned_checksum = drf_planned['measure'].sum()\n",
118+
"\n",
119+
"print(\"drf_actuals.shape:\", drf_actuals.shape)\n",
120+
"print(\"checksum:\", drf_actuals_checksum)\n",
121+
"print(\"drf_planned.shape:\", drf_planned.shape)\n",
122+
"print(\"checksum:\", drf_planned_checksum)\n",
123+
"\n",
124+
"# Concatenate actuals and planned entries\n",
125+
"drf = pd.concat([drf_actuals, drf_planned])\n",
126+
"drf_checksum = drf['measure'].sum()\n",
127+
"\n",
128+
"print(\"drf.shape:\", drf.shape)\n",
129+
"print(\"checksum:\", drf_checksum)\n",
130+
"print(\"checksum difference:\", drf_checksum - (drf_planned_checksum+drf_actuals_checksum))\n",
131+
"print(drf.info())\n",
132+
"\n",
133+
"# Pivot to get a wide format table with spending/FTE columns\n",
134+
"print(\"pivoting drf\")\n",
135+
"drf = drf.pivot_table(\n",
136+
" index=['org_id', 'program_id', 'report_yr', 'measure_yr', 'planned_actual'], \n",
137+
" columns=['spending_fte'], \n",
138+
" values='measure'\n",
139+
").sort_values(\n",
140+
" by=['org_id', 'program_id', 'report_yr','measure_yr']\n",
141+
").reset_index()\n",
142+
"\n",
143+
"print(\"drf.shape:\", drf.shape)\n",
144+
"\n",
145+
"ftes_checksum = drf['ftes'].sum()\n",
146+
"print('ftes_checksum:', ftes_checksum)\n",
147+
"spending_checksum = drf['spending'].sum()\n",
148+
"print('spending_checksum:', spending_checksum)\n",
149+
"print(\"checksum difference:\", drf_checksum - (ftes_checksum+spending_checksum))\n",
150+
"print(drf.info())\n",
151+
"\n",
152+
"# Set up si_link_yr: a fiscal year column to be able to include years \n",
153+
"# beyond the service inventory when joining on program id and fiscal year.\n",
154+
"# If measure_yr > latest_si_fy, use latest_si_fy; otherwise use measure_yr.\n",
155+
"drf.loc[drf['measure_yr']>latest_si_fy, 'si_link_yr'] = latest_si_fy\n",
156+
"drf.loc[drf['measure_yr']<=latest_si_fy, 'si_link_yr'] = drf['measure_yr']\n",
157+
"drf['si_link_yr'] = drf['si_link_yr'].astype(int) \n",
158+
"\n",
159+
"\n",
160+
"drf_files = {\n",
161+
" \"drf_actuals\":drf_actuals,\n",
162+
" \"drf_planned\": drf_planned,\n",
163+
" \"drf\": drf\n",
164+
"}\n",
165+
"\n",
166+
"\n",
167+
"#export_to_csv(drf_files, Path.cwd())"
168+
]
169+
},
170+
{
171+
"cell_type": "markdown",
172+
"id": "4259c781-eaa3-4967-8839-8b78fbca60fe",
173+
"metadata": {},
174+
"source": [
175+
"si_drf = si.loc[:, ['service_id', 'fiscal_yr', 'program_id']]\n",
176+
"si_drf = si_drf.explode('program_id')\n",
177+
"si_drf['si_yr'] = si_drf['fiscal_yr'].str.split('-').str[1].astype(int)\n",
178+
"si_drf = si_drf[si_drf['program_id'].notna()]\n",
179+
"\n",
180+
"service_fte_spending = pd.merge(\n",
181+
" si_drf, \n",
182+
" drf, \n",
183+
" how='left', \n",
184+
" left_on=['si_yr', 'program_id'], \n",
185+
" right_on=['si_link_yr', 'program_id']\n",
186+
")\n",
187+
"\n",
188+
"print(service_fte_spending.info())\n",
189+
"service_fte_spending\n"
190+
]
191+
}
192+
],
193+
"metadata": {
194+
"kernelspec": {
195+
"display_name": "Python 3 (ipykernel)",
196+
"language": "python",
197+
"name": "python3"
198+
},
199+
"language_info": {
200+
"codemirror_mode": {
201+
"name": "ipython",
202+
"version": 3
203+
},
204+
"file_extension": ".py",
205+
"mimetype": "text/x-python",
206+
"name": "python",
207+
"nbconvert_exporter": "python",
208+
"pygments_lexer": "ipython3",
209+
"version": "3.12.8"
210+
}
211+
},
212+
"nbformat": 4,
213+
"nbformat_minor": 5
214+
}

0 commit comments

Comments
 (0)