Skip to content

Commit 6d482e7

Browse files
committed
all major targets loaded
1 parent e45072e commit 6d482e7

File tree

11 files changed

+721
-541
lines changed

11 files changed

+721
-541
lines changed

Makefile

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,11 @@ documentation-dev:
6161

6262
database:
6363
python policyengine_us_data/db/create_database_tables.py
64-
python policyengine_us_data/db/load_age_targets.py
64+
python policyengine_us_data/db/create_initial_strata.py
65+
python policyengine_us_data/db/etl_age.py
66+
python policyengine_us_data/db/etl_medicaid.py
67+
python policyengine_us_data/db/etl_snap.py
68+
python policyengine_us_data/db/etl_irs_soi.py
6569

6670
clean-database:
6771
rm *.db
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
from typing import Dict
2+
3+
import pandas as pd
4+
from sqlalchemy import create_engine
5+
from sqlalchemy.orm import sessionmaker
6+
from sqlmodel import SQLModel, Session, select
7+
8+
9+
from policyengine_us.variables.household.demographic.geographic.ucgid.ucgid_enum import UCGID
10+
from policyengine_us_data.db.create_database_tables import (
11+
Stratum,
12+
StratumConstraint,
13+
)
14+
15+
16+
17+
def main():
    """Seed the database with one geographic stratum per ``UCGID`` member.

    Builds a parent/child table from the hierarchy each ``UCGID`` member
    reports, then inserts one ``Stratum`` per member. Each stratum carries a
    single ``ucgid_str`` constraint and points at its parent's stratum id.

    The session is managed by a context manager so the connection is released
    even if a flush or commit fails; nothing is committed unless every row is
    inserted successfully.
    """
    # Get the implied hierarchy by the UCGID enum --------
    rows = []
    for node in UCGID:
        codes = node.get_hierarchical_codes()
        rows.append(
            {
                "name": node.name,
                "code": codes[0],
                # Members with a single code have no parent.
                "parent": codes[1] if len(codes) > 1 else None,
            }
        )

    # Parent-less rows come first. The loop below relies on this ordering
    # placing every parent ahead of its children so the parent's id is
    # already known when a child is inserted.
    hierarchy_df = (
        pd.DataFrame(rows)
        .sort_values(["parent", "code"], na_position="first")
        .reset_index(drop=True)
    )

    DATABASE_URL = "sqlite:///policy_data.db"
    engine = create_engine(DATABASE_URL)

    # Named so it does not shadow the ``Session`` class imported from sqlmodel.
    session_factory = sessionmaker(bind=engine)

    # map the ucgid_str 'code' to auto-generated 'stratum_id'
    code_to_stratum_id: Dict[str, int] = {}

    with session_factory() as session:
        for _, row in hierarchy_df.iterrows():
            parent_code = row["parent"]

            # A parent code that has not been inserted yet resolves to None.
            parent_id = (
                code_to_stratum_id.get(parent_code) if parent_code else None
            )

            new_stratum = Stratum(
                parent_stratum_id=parent_id,
                notes=f'{row["name"]} (ucgid {row["code"]})',
                stratum_group_id=1,
            )

            new_stratum.constraints_rel = [
                StratumConstraint(
                    constraint_variable="ucgid_str",
                    operation="in",
                    value=row["code"],
                )
            ]

            session.add(new_stratum)

            # Flush so the auto-generated stratum_id is populated before it
            # is recorded for use by child rows.
            session.flush()

            code_to_stratum_id[row["code"]] = new_stratum.stratum_id

        session.commit()
71+
if __name__ == "__main__":
72+
main()

policyengine_us_data/db/etl_age.py

Lines changed: 27 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
StratumConstraint,
1313
Target,
1414
)
15+
from policyengine_us_data.utils.census import get_census_docs, pull_acs_table
1516

1617

1718
LABEL_TO_SHORT = {
@@ -32,67 +33,11 @@
3233
"Estimate!!Total!!Total population!!AGE!!70 to 74 years": "70-74",
3334
"Estimate!!Total!!Total population!!AGE!!75 to 79 years": "75-79",
3435
"Estimate!!Total!!Total population!!AGE!!80 to 84 years": "80-84",
35-
"Estimate!!Total!!Total population!!AGE!!85 years and over": "85-inf",
36+
"Estimate!!Total!!Total population!!AGE!!85 years and over": "85-999",
3637
}
3738
AGE_COLS = list(LABEL_TO_SHORT.values())
3839

3940

40-
def extract_docs(year=2023):
41-
docs_url = (
42-
f"https://api.census.gov/data/{year}/acs/acs1/subject/variables.json"
43-
)
44-
45-
try:
46-
docs_response = requests.get(docs_url)
47-
docs_response.raise_for_status()
48-
49-
docs = docs_response.json()
50-
docs["year"] = year
51-
52-
except requests.exceptions.RequestException as e:
53-
print(f"Error during API request: {e}")
54-
raise
55-
except Exception as e:
56-
print(f"An error occurred: {e}")
57-
raise
58-
return docs
59-
60-
61-
def extract_age_data(geo, year=2023):
62-
base_url = (
63-
f"https://api.census.gov/data/{year}/acs/acs1/subject?get=group(S0101)"
64-
)
65-
66-
if geo == "State":
67-
url = f"{base_url}&for=state:*"
68-
elif geo == "District":
69-
url = f"{base_url}&for=congressional+district:*"
70-
elif geo == "National":
71-
url = f"{base_url}&for=us:*"
72-
else:
73-
raise ValueError(
74-
"geo must be either 'National', 'State', or 'District'"
75-
)
76-
77-
try:
78-
response = requests.get(url)
79-
response.raise_for_status()
80-
81-
data = response.json()
82-
83-
headers = data[0]
84-
data_rows = data[1:]
85-
df = pd.DataFrame(data_rows, columns=headers)
86-
87-
except requests.exceptions.RequestException as e:
88-
print(f"Error during API request: {e}")
89-
raise
90-
except Exception as e:
91-
print(f"An error occurred: {e}")
92-
raise
93-
return df
94-
95-
9641
def transform_age_data(age_data, docs):
9742
df = age_data.copy()
9843

@@ -131,13 +76,14 @@ def transform_age_data(age_data, docs):
13176
var_name="age_range",
13277
value_name="value",
13378
)
134-
age_bounds = df_long["age_range"].str.split("-", expand=True)
135-
df_long["age_greater_than_or_equal_to"] = (
136-
age_bounds[0].str.replace("+", "").astype(int)
137-
)
138-
df_long["age_less_than_or_equal_to"] = pd.to_numeric(age_bounds[1])
79+
age_bounds = df_long["age_range"].str.split("-", expand=True).astype(int)
80+
age_bounds.columns = ["ge", "le"]
81+
age_bounds[['gt']] = age_bounds[["ge"]] - 1
82+
age_bounds[['lt']] = age_bounds[["le"]] + 1
83+
84+
df_long["age_greater_than"] = age_bounds[["gt"]]
85+
df_long["age_less_than"] = age_bounds[["lt"]]
13986
df_long["variable"] = "person_count"
140-
df_long["period"] = docs["year"]
14187
df_long["reform_id"] = 0
14288
df_long["source_id"] = 1
14389
df_long["active"] = True
@@ -149,7 +95,7 @@ def get_parent_geo(geo):
14995
return {"National": None, "State": "National", "District": "State"}[geo]
15096

15197

152-
def load_age_data(df_long, geo, stratum_lookup={}):
98+
def load_age_data(df_long, geo, year, stratum_lookup={}):
15399

154100
# Quick data quality check before loading ----
155101
if geo == "National":
@@ -192,6 +138,7 @@ def load_age_data(df_long, geo, stratum_lookup={}):
192138
)
193139

194140
# Create constraints and link them to the parent's relationship attribute.
141+
# TODO: greater_than_or_equal_to to just greater than!
195142
new_stratum.constraints_rel = [
196143
StratumConstraint(
197144
constraint_variable="ucgid_str",
@@ -200,26 +147,26 @@ def load_age_data(df_long, geo, stratum_lookup={}):
200147
),
201148
StratumConstraint(
202149
constraint_variable="age",
203-
operation="greater_than_or_equal",
204-
value=str(row["age_greater_than_or_equal_to"]),
150+
operation="greater_than",
151+
value=str(row["age_greater_than"]),
205152
),
206153
]
207154

208-
age_lt_value = row["age_less_than_or_equal_to"]
155+
age_lt_value = row["age_less_than"]
209156
if not np.isinf(age_lt_value):
210157
new_stratum.constraints_rel.append(
211158
StratumConstraint(
212159
constraint_variable="age",
213160
operation="less_than",
214-
value=str(age_lt_value + 1),
161+
value=str(row["age_less_than"]),
215162
)
216163
)
217164

218165
# Create the Target and link it to the parent.
219166
new_stratum.targets_rel.append(
220167
Target(
221168
variable=row["variable"],
222-
period=row["period"],
169+
period=year,
223170
value=row["value"],
224171
source_id=row["source_id"],
225172
active=row["active"],
@@ -243,18 +190,24 @@ def load_age_data(df_long, geo, stratum_lookup={}):
243190
if __name__ == "__main__":
244191

245192
# --- ETL: Extract, Transform, Load ----
193+
year = 2023
246194

247195
# ---- Extract ----------
248-
docs = extract_docs(2023)
249-
national_df = extract_age_data("National", 2023)
250-
state_df = extract_age_data("State", 2023)
196+
docs = get_census_docs(year)
197+
national_df = pull_acs_table("S0101", "National", year)
198+
state_df = pull_acs_table("S0101", "State", year)
199+
district_df = pull_acs_table("S0101", "District", year)
251200

252201
# --- Transform ----------
253202
long_national_df = transform_age_data(national_df, docs)
254203
long_state_df = transform_age_data(state_df, docs)
204+
long_district_df = transform_age_data(district_df, docs)
255205

256206
# --- Load --------
257-
national_strata_lku = load_age_data(long_national_df, "National")
207+
national_strata_lku = load_age_data(long_national_df, "National", year)
258208
state_strata_lku = load_age_data(
259-
long_state_df, "State", national_strata_lku
209+
long_state_df, "State", year, national_strata_lku
210+
)
211+
load_age_data(
212+
long_district_df, "District", year, state_strata_lku
260213
)

0 commit comments

Comments
 (0)