|
5 | 5 | from policyengine_us_data.datasets.puf import * |
6 | 6 | import pandas as pd |
7 | 7 | import os |
8 | | -from policyengine_us_data.utils import QRF |
| 8 | +from microimpute.models.qrf import QRF |
9 | 9 | import time |
10 | 10 | import logging |
| 11 | +import gc |
11 | 12 |
|
12 | 13 | # These are sorted by magnitude. |
13 | 14 | # First 15 contain 90%. |
|
28 | 29 | "self_employment_income", |
29 | 30 | "w2_wages_from_qualified_business", |
30 | 31 | "unadjusted_basis_qualified_property", |
31 | | - "business_is_sstb", |
| 32 | + "business_is_sstb", # bool |
32 | 33 | "short_term_capital_gains", |
33 | 34 | "qualified_dividend_income", |
34 | 35 | "charitable_cash_donations", |
@@ -220,25 +221,103 @@ def impute_income_variables( |
220 | 221 | predictors: list[str] = None, |
221 | 222 | outputs: list[str] = None, |
222 | 223 | ): |
223 | | - X_train = puf_sim.calculate_dataframe(predictors) |
224 | | - y_train = puf_sim.calculate_dataframe(outputs) |
225 | | - X = cps_sim.calculate_dataframe(predictors) |
226 | | - y = pd.DataFrame(columns=outputs, index=X.index) |
227 | | - model = QRF() |
228 | | - start = time.time() |
229 | | - model.fit( |
230 | | - X_train, |
231 | | - y_train, |
| 224 | + |
| 225 | + # Calculate all variables together to preserve dependencies |
| 226 | + X_train = puf_sim.calculate_dataframe(predictors + outputs) |
| 227 | + |
| 228 | + # Check which outputs are actually in the result |
| 229 | + available_outputs = [col for col in outputs if col in X_train.columns] |
| 230 | + missing_outputs = [col for col in outputs if col not in X_train.columns] |
| 231 | + |
| 232 | + if missing_outputs: |
| 233 | + logging.warning( |
| 234 | + f"The following {len(missing_outputs)} variables were not calculated: {missing_outputs}" |
| 235 | + ) |
| 236 | + # Log the specific missing variable that's causing issues |
| 237 | + if "recapture_of_investment_credit" in missing_outputs: |
| 238 | + logging.error( |
| 239 | + "recapture_of_investment_credit is missing from PUF calculation!" |
| 240 | + ) |
| 241 | + |
| 242 | + logging.info( |
| 243 | + f"X_train shape: {X_train.shape}, columns: {len(X_train.columns)}" |
232 | 244 | ) |
| 245 | + |
| 246 | + X_test = cps_sim.calculate_dataframe(predictors) |
| 247 | + |
233 | 248 | logging.info( |
234 | | - f"Training imputation models from the PUF took {time.time() - start:.2f} seconds" |
| 249 | + f"Imputing {len(available_outputs)} variables using batched sequential QRF" |
235 | 250 | ) |
236 | | - start = time.time() |
237 | | - y = model.predict(X) |
| 251 | + total_start = time.time() |
| 252 | + |
| 253 | + # Batch variables to avoid memory issues with sequential imputation |
| 254 | +    batch_size = 10  # Process 10 variables per batch to limit peak memory |
| 255 | + result = pd.DataFrame(index=X_test.index) |
| 256 | + |
| 257 | + # Sample training data more aggressively upfront |
| 258 | +    sample_size = min(5000, len(X_train))  # Cap training rows at 5,000 |
| 259 | + if len(X_train) > sample_size: |
| 260 | + logging.info( |
| 261 | + f"Sampling training data from {len(X_train)} to {sample_size} rows" |
| 262 | + ) |
| 263 | + X_train_sampled = X_train.sample(n=sample_size, random_state=42) |
| 264 | + else: |
| 265 | + X_train_sampled = X_train |
| 266 | + |
| 267 | + for batch_start in range(0, len(available_outputs), batch_size): |
| 268 | + batch_end = min(batch_start + batch_size, len(available_outputs)) |
| 269 | + batch_vars = available_outputs[batch_start:batch_end] |
| 270 | + |
| 271 | + logging.info( |
| 272 | + f"Processing batch {batch_start//batch_size + 1}: variables {batch_start+1}-{batch_end} ({batch_vars})" |
| 273 | + ) |
| 274 | + |
| 275 | + # Force garbage collection before each batch |
| 276 | + gc.collect() |
| 277 | + |
| 278 | + # Create a fresh QRF for each batch |
| 279 | + qrf = QRF( |
| 280 | + log_level="INFO", |
| 281 | + memory_efficient=True, |
| 282 | + batch_size=10, |
| 283 | + cleanup_interval=5, |
| 284 | + ) |
| 285 | + |
| 286 | + # Use pre-sampled data for this batch |
| 287 | + batch_X_train = X_train_sampled[predictors + batch_vars].copy() |
| 288 | + |
| 289 | + # Fit model for this batch with sequential imputation within the batch |
| 290 | + fitted_model = qrf.fit( |
| 291 | + X_train=batch_X_train, |
| 292 | + predictors=predictors, |
| 293 | + imputed_variables=batch_vars, |
| 294 | + n_jobs=1, # Single thread to reduce memory overhead |
| 295 | + ) |
| 296 | + |
| 297 | + # Predict for this batch |
| 298 | + batch_predictions = fitted_model.predict(X_test=X_test) |
| 299 | + |
| 300 | + # Extract median predictions and add to result |
| 301 | + for var in batch_vars: |
| 302 | + result[var] = batch_predictions[0.5][var] |
| 303 | + |
| 304 | + # Clean up batch objects |
| 305 | + del fitted_model |
| 306 | + del batch_predictions |
| 307 | + del batch_X_train |
| 308 | + gc.collect() |
| 309 | + |
| 310 | + logging.info(f"Completed batch {batch_start//batch_size + 1}") |
| 311 | + |
| 312 | + # Add zeros for missing variables |
| 313 | + for var in missing_outputs: |
| 314 | + result[var] = 0 |
| 315 | + |
238 | 316 | logging.info( |
239 | | - f"Predicting imputed values took {time.time() - start:.2f} seconds" |
| 317 | + f"Imputing {len(available_outputs)} variables took {time.time() - total_start:.2f} seconds total" |
240 | 318 | ) |
241 | | - return y |
| 319 | + |
| 320 | + return result |
242 | 321 |
|
243 | 322 |
|
244 | 323 | class ExtendedCPS_2024(ExtendedCPS): |
|
0 commit comments