5 changes: 5 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: patch
+  changes:
+    changed:
+      - renamed "ucgid" to "ucgid_str" in the age targets loading script and changed the constraint operation from "equals" to "in"
+      - removed the [0.5] key access from imputation results, per microimpute's new output format
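Every [0.5] removal in the diffs below follows the same migration: prediction results were previously indexed by quantile (a dict of DataFrames keyed by 0.5 for the median) before selecting a column, and are now indexed by column directly. A minimal runnable sketch of the two access patterns, using a plain pandas DataFrame as a stand-in for the model output (microimpute itself is not imported here):

import pandas as pd

# Stand-in for a fitted model's predictions.
preds = pd.DataFrame({"rent": [850.0, 1200.0, 640.0]})

# Old output format: a dict mapping quantiles to DataFrames.
old_style = {0.5: preds}
rent = old_style[0.5]["rent"]  # select the median DataFrame, then the column

# New output format: a single DataFrame of predictions.
new_style = preds
rent = new_style["rent"]  # index the column directly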
8 changes: 4 additions & 4 deletions policyengine_us_data/datasets/cps/cps.py
@@ -186,14 +186,14 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
         imputed_values = fitted_model.predict(X_test=inference_df)
         logging.info("Imputation complete.")
         cps["rent"] = np.zeros_like(cps["age"])
-        cps["rent"][mask] = imputed_values[0.5]["rent"]
+        cps["rent"][mask] = imputed_values["rent"]
         # Assume zero housing assistance since
         cps["pre_subsidy_rent"] = cps["rent"]
         cps["housing_assistance"] = np.zeros_like(
             cps["spm_unit_capped_housing_subsidy_reported"]
         )
         cps["real_estate_taxes"] = np.zeros_like(cps["age"])
-        cps["real_estate_taxes"][mask] = imputed_values[0.5]["real_estate_taxes"]
+        cps["real_estate_taxes"][mask] = imputed_values["real_estate_taxes"]


     def add_takeup(self):
@@ -1618,7 +1618,7 @@ def add_tips(self, cps: h5py.File):
         cps["tip_income"] = model.predict(
             X_test=cps,
             mean_quantile=0.5,
-        )[0.5].tip_income.values
+        ).tip_income.values

         self.save_dataset(cps)

@@ -1957,7 +1957,7 @@ def determine_reference_person(group):
         imputations = fitted_model.predict(X_test=receiver_data)

         for var in IMPUTED_VARIABLES:
-            cps[var] = imputations[0.5][var]
+            cps[var] = imputations[var]

         cps["net_worth"] = cps["networth"]
         del cps["networth"]
2 changes: 1 addition & 1 deletion policyengine_us_data/datasets/cps/extended_cps.py
@@ -299,7 +299,7 @@ def impute_income_variables(

         # Extract median predictions and add to result
         for var in batch_vars:
-            result[var] = batch_predictions[0.5][var]
+            result[var] = batch_predictions[var]

         # Clean up batch objects
         del fitted_model
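The surrounding loop (truncated in the hunk above) fits one model per batch of variables and frees it before the next iteration. A hypothetical sketch of that batching pattern; fit_model is an assumed stand-in for the real training call, not an actual helper in this repository:

def impute_in_batches(variables, batch_size, train_df, test_df):
    result = {}
    for i in range(0, len(variables), batch_size):
        batch_vars = variables[i : i + batch_size]
        fitted_model = fit_model(train_df, batch_vars)  # assumed helper
        batch_predictions = fitted_model.predict(X_test=test_df)
        # Extract predictions for this batch and add to result
        for var in batch_vars:
            result[var] = batch_predictions[var]
        del fitted_model  # free memory before the next batch
    return result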
7 changes: 2 additions & 5 deletions policyengine_us_data/datasets/puf/puf.py
@@ -189,8 +189,7 @@ def impute_pension_contributions_to_puf(puf_df):
     # Predict using the fitted model
     predictions = fitted_model.predict(X_test=puf_df[["employment_income"]])

-    # Return the median (0.5 quantile) predictions
-    return predictions[0.5]["pre_tax_contributions"]
+    return predictions["pre_tax_contributions"]


 def impute_missing_demographics(
@@ -242,12 +241,10 @@ def impute_missing_demographics(
     ].reset_index()

     # Predict demographics
-    predictions = fitted_model.predict(
+    predicted_demographics = fitted_model.predict(
         X_test=puf_without_demographics[NON_DEMOGRAPHIC_VARIABLES]
     )

-    # Get median predictions
-    predicted_demographics = predictions[0.5]
     puf_with_imputed_demographics = pd.concat(
         [puf_without_demographics, predicted_demographics], axis=1
     )
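One detail this hunk depends on: pd.concat(..., axis=1) aligns rows by index label, not by position, which is why puf_without_demographics is built with .reset_index() before the predictions are concatenated alongside it. A toy illustration (column names hypothetical):

import pandas as pd

left = pd.DataFrame({"employment_income": [10, 20]}, index=[5, 9])
right = pd.DataFrame({"age": [34, 51]}, index=[0, 1])

# Index-aligned concat scatters the rows across four index labels with NaNs...
misaligned = pd.concat([left, right], axis=1)

# ...while resetting the index first pairs the rows positionally.
aligned = pd.concat([left.reset_index(drop=True), right], axis=1)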
24 changes: 12 additions & 12 deletions policyengine_us_data/db/load_age_targets.py
@@ -174,18 +174,18 @@ def transform_age_data(age_data, docs):
     )

     df = df.drop(columns="NAME")
-    df = df.rename({"GEO_ID": "ucgid"}, axis=1)
-    df_data = df.rename(columns=rename_mapping)[["ucgid"] + list(AGE_COLS)]
+    df = df.rename({"GEO_ID": "ucgid_str"}, axis=1)
+    df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)]

     # Filter out Puerto Rico's district and state records, if needed
     df_geos = df_data[
-        ~df_data["ucgid"].isin(["5001800US7298", "0400000US72"])
+        ~df_data["ucgid_str"].isin(["5001800US7298", "0400000US72"])
     ].copy()

-    df = df_geos[["ucgid"] + AGE_COLS]
+    df = df_geos[["ucgid_str"] + AGE_COLS]

     df_long = df.melt(
-        id_vars="ucgid",
+        id_vars="ucgid_str",
         value_vars=AGE_COLS,
         var_name="age_range",
         value_name="value",
@@ -212,11 +212,11 @@ def load_age_data(df_long, geo, stratum_lookup={}):

     # Quick data quality check before loading ----
     if geo == "National":
-        assert len(set(df_long.ucgid)) == 1
+        assert len(set(df_long.ucgid_str)) == 1
     elif geo == "State":
-        assert len(set(df_long.ucgid)) == 51
+        assert len(set(df_long.ucgid_str)) == 51
     elif geo == "District":
-        assert len(set(df_long.ucgid)) == 436
+        assert len(set(df_long.ucgid_str)) == 436
     else:
         raise ValueError('geo must be one of "National", "State", "District"')

@@ -238,7 +238,7 @@ def load_age_data(df_long, geo, stratum_lookup={}):

         # Create the parent Stratum object.
         # We will attach children to it before adding it to the session.
-        note = f"Age: {row['age_range']}, Geo: {row['ucgid']}"
+        note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}"
         parent_geo = get_parent_geo(geo)
         parent_stratum_id = (
             stratum_lookup[parent_geo][row["age_range"]]
@@ -253,9 +253,9 @@ def load_age_data(df_long, geo, stratum_lookup={}):
         # Create constraints and link them to the parent's relationship attribute.
         new_stratum.constraints_rel = [
             StratumConstraint(
-                constraint_variable="ucgid",
-                operation="equals",
-                value=row["ucgid"],
+                constraint_variable="ucgid_str",
+                operation="in",
+                value=row["ucgid_str"],
             ),
             StratumConstraint(
                 constraint_variable="age",
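Finally, the switch from operation="equals" to operation="in" on the geography constraint suggests the loader now evaluates value as a membership test rather than an exact match, which would let one constraint row carry several geography IDs. A hypothetical evaluator for the two operations; the real StratumConstraint semantics live elsewhere in the codebase:

def satisfies(record_value, operation, value):
    """Hypothetical constraint check mirroring the two operations in the diff."""
    if operation == "equals":
        return record_value == value
    if operation == "in":
        # Assumes `value` holds a comma-separated list of allowed codes.
        return record_value in {v.strip() for v in value.split(",")}
    raise ValueError(f"Unknown operation: {operation}")

assert satisfies("0400000US06", "in", "0400000US06")
assert satisfies("0400000US06", "in", "0400000US06,0400000US36")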