5 changes: 5 additions & 0 deletions changelog_entry.yaml
@@ -0,0 +1,5 @@
+- bump: patch
+  changes:
+    changed:
+      - renamed "ucgid" to "ucgid_str" in the age targets loading script and changed the constraint operation from "equals" to "in"
+      - removed the [0.5] key access from imputation results, per microimpute's new output format
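Every [0.5] removal in the diffs below follows the same migration: prediction results were previously indexed by quantile (a dict of DataFrames keyed by 0.5 for the median) before selecting a column, and are now indexed by column directly. A minimal runnable sketch of the two access patterns, using a plain pandas DataFrame as a stand-in for the model output (microimpute itself is not imported here):

import pandas as pd

# Stand-in for a fitted model's predictions.
preds = pd.DataFrame({"rent": [850.0, 1200.0, 640.0]})

# Old output format: a dict mapping quantiles to DataFrames.
old_style = {0.5: preds}
rent = old_style[0.5]["rent"]  # select the median DataFrame, then the column

# New output format: a single DataFrame of predictions.
new_style = preds
rent = new_style["rent"]  # index the column directly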
8 changes: 4 additions & 4 deletions policyengine_us_data/datasets/cps/cps.py
@@ -186,14 +186,14 @@ def add_rent(self, cps: h5py.File, person: DataFrame, household: DataFrame):
         imputed_values = fitted_model.predict(X_test=inference_df)
         logging.info("Imputation complete.")
         cps["rent"] = np.zeros_like(cps["age"])
-        cps["rent"][mask] = imputed_values[0.5]["rent"]
+        cps["rent"][mask] = imputed_values["rent"]
         # Assume zero housing assistance since
         cps["pre_subsidy_rent"] = cps["rent"]
         cps["housing_assistance"] = np.zeros_like(
             cps["spm_unit_capped_housing_subsidy_reported"]
         )
         cps["real_estate_taxes"] = np.zeros_like(cps["age"])
-        cps["real_estate_taxes"][mask] = imputed_values[0.5]["real_estate_taxes"]
+        cps["real_estate_taxes"][mask] = imputed_values["real_estate_taxes"]


     def add_takeup(self):
@@ -1618,7 +1618,7 @@ def add_tips(self, cps: h5py.File):
         cps["tip_income"] = model.predict(
             X_test=cps,
             mean_quantile=0.5,
-        )[0.5].tip_income.values
+        ).tip_income.values

         self.save_dataset(cps)

@@ -1957,7 +1957,7 @@ def determine_reference_person(group):
         imputations = fitted_model.predict(X_test=receiver_data)

         for var in IMPUTED_VARIABLES:
-            cps[var] = imputations[0.5][var]
+            cps[var] = imputations[var]

         cps["net_worth"] = cps["networth"]
         del cps["networth"]
2 changes: 1 addition & 1 deletion policyengine_us_data/datasets/cps/extended_cps.py
@@ -299,7 +299,7 @@ def impute_income_variables(

         # Extract median predictions and add to result
         for var in batch_vars:
-            result[var] = batch_predictions[0.5][var]
+            result[var] = batch_predictions[var]

         # Clean up batch objects
         del fitted_model
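The surrounding loop (truncated in the hunk above) fits one model per batch of variables and frees it before the next iteration. A hypothetical sketch of that batching pattern; fit_model is an assumed stand-in for the real training call, not an actual helper in this repository:

def impute_in_batches(variables, batch_size, train_df, test_df):
    result = {}
    for i in range(0, len(variables), batch_size):
        batch_vars = variables[i : i + batch_size]
        fitted_model = fit_model(train_df, batch_vars)  # assumed helper
        batch_predictions = fitted_model.predict(X_test=test_df)
        # Extract predictions for this batch and add to result
        for var in batch_vars:
            result[var] = batch_predictions[var]
        del fitted_model  # free memory before the next batch
    return result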
7 changes: 2 additions & 5 deletions policyengine_us_data/datasets/puf/puf.py
@@ -189,8 +189,7 @@ def impute_pension_contributions_to_puf(puf_df):
     # Predict using the fitted model
     predictions = fitted_model.predict(X_test=puf_df[["employment_income"]])

-    # Return the median (0.5 quantile) predictions
-    return predictions[0.5]["pre_tax_contributions"]
+    return predictions["pre_tax_contributions"]


 def impute_missing_demographics(
@@ -242,12 +241,10 @@ def impute_missing_demographics(
     ].reset_index()

     # Predict demographics
-    predictions = fitted_model.predict(
+    predicted_demographics = fitted_model.predict(
         X_test=puf_without_demographics[NON_DEMOGRAPHIC_VARIABLES]
     )

-    # Get median predictions
-    predicted_demographics = predictions[0.5]
     puf_with_imputed_demographics = pd.concat(
         [puf_without_demographics, predicted_demographics], axis=1
     )
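One detail this hunk depends on: pd.concat(..., axis=1) aligns rows by index label, not by position, which is why puf_without_demographics is built with .reset_index() before the predictions are concatenated alongside it. A toy illustration (column names hypothetical):

import pandas as pd

left = pd.DataFrame({"employment_income": [10, 20]}, index=[5, 9])
right = pd.DataFrame({"age": [34, 51]}, index=[0, 1])

# Index-aligned concat scatters the rows across four index labels with NaNs...
misaligned = pd.concat([left, right], axis=1)

# ...while resetting the index first pairs the rows positionally.
aligned = pd.concat([left.reset_index(drop=True), right], axis=1)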
24 changes: 12 additions & 12 deletions policyengine_us_data/db/load_age_targets.py
@@ -174,18 +174,18 @@ def transform_age_data(age_data, docs):
     )

     df = df.drop(columns="NAME")
-    df = df.rename({"GEO_ID": "ucgid"}, axis=1)
-    df_data = df.rename(columns=rename_mapping)[["ucgid"] + list(AGE_COLS)]
+    df = df.rename({"GEO_ID": "ucgid_str"}, axis=1)
+    df_data = df.rename(columns=rename_mapping)[["ucgid_str"] + list(AGE_COLS)]

     # Filter out Puerto Rico's district and state records, if needed
     df_geos = df_data[
-        ~df_data["ucgid"].isin(["5001800US7298", "0400000US72"])
+        ~df_data["ucgid_str"].isin(["5001800US7298", "0400000US72"])
     ].copy()

-    df = df_geos[["ucgid"] + AGE_COLS]
+    df = df_geos[["ucgid_str"] + AGE_COLS]

     df_long = df.melt(
-        id_vars="ucgid",
+        id_vars="ucgid_str",
         value_vars=AGE_COLS,
         var_name="age_range",
         value_name="value",
@@ -212,11 +212,11 @@ def load_age_data(df_long, geo, stratum_lookup={}):

     # Quick data quality check before loading ----
     if geo == "National":
-        assert len(set(df_long.ucgid)) == 1
+        assert len(set(df_long.ucgid_str)) == 1
     elif geo == "State":
-        assert len(set(df_long.ucgid)) == 51
+        assert len(set(df_long.ucgid_str)) == 51
     elif geo == "District":
-        assert len(set(df_long.ucgid)) == 436
+        assert len(set(df_long.ucgid_str)) == 436
     else:
         raise ValueError('geo must be one of "National", "State", "District"')

@@ -238,7 +238,7 @@ def load_age_data(df_long, geo, stratum_lookup={}):

         # Create the parent Stratum object.
         # We will attach children to it before adding it to the session.
-        note = f"Age: {row['age_range']}, Geo: {row['ucgid']}"
+        note = f"Age: {row['age_range']}, Geo: {row['ucgid_str']}"
         parent_geo = get_parent_geo(geo)
         parent_stratum_id = (
             stratum_lookup[parent_geo][row["age_range"]]
@@ -253,9 +253,9 @@ def load_age_data(df_long, geo, stratum_lookup={}):
         # Create constraints and link them to the parent's relationship attribute.
         new_stratum.constraints_rel = [
             StratumConstraint(
-                constraint_variable="ucgid",
-                operation="equals",
-                value=row["ucgid"],
+                constraint_variable="ucgid_str",
+                operation="in",
+                value=row["ucgid_str"],
             ),
             StratumConstraint(
                 constraint_variable="age",
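Finally, the switch from operation="equals" to operation="in" on the geography constraint suggests the loader now evaluates value as a membership test rather than an exact match, which would let one constraint row carry several geography IDs. A hypothetical evaluator for the two operations; the real StratumConstraint semantics live elsewhere in the codebase:

def satisfies(record_value, operation, value):
    """Hypothetical constraint check mirroring the two operations in the diff."""
    if operation == "equals":
        return record_value == value
    if operation == "in":
        # Assumes `value` holds a comma-separated list of allowed codes.
        return record_value in {v.strip() for v in value.split(",")}
    raise ValueError(f"Unknown operation: {operation}")

assert satisfies("0400000US06", "in", "0400000US06")
assert satisfies("0400000US06", "in", "0400000US06,0400000US36")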