diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py
index 9771b43f..ca6759b2 100644
--- a/policyengine_us_data/datasets/cps/cps.py
+++ b/policyengine_us_data/datasets/cps/cps.py
@@ -231,6 +231,7 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None:
         np.random.randint(80, 85, len(person)),
         person.A_AGE,
     )
+    # A_SEX is 1 -> male, 2 -> female.
     cps["is_female"] = person.A_SEX == 2
 
     # "Is...blind or does...have serious difficulty seeing even when Wearing
@@ -241,6 +242,36 @@ def add_personal_variables(cps: h5py.File, person: DataFrame) -> None:
     ]
     cps["is_disabled"] = (person[DISABILITY_FLAGS] == 1).any(axis=1)
 
+    def _assign_some_newborns_to_pregnancy(
+        age: pd.Series, person: pd.DataFrame
+    ) -> pd.Series:
+        """Reassign about half of age-0 rows to a pregnancy (negative age).
+
+        Args:
+            age: Ages computed so far, one per row of ``person``; rows with
+                ``A_AGE != 0`` keep this value.
+            person: Person-level records; only ``A_AGE`` is read.
+
+        Returns:
+            Array of ages in which each ``A_AGE == 0`` row is drawn from
+            [-0.75, 0) (pregnancy) or [0, 1) (newborn) by a fair coin flip.
+        """
+        n = len(person)
+        # Fair coin flip per row: 1 -> pregnancy, 0 -> newborn.
+        in_pregnancy = np.random.randint(0, 2, n)
+        # Age -0.75 is pregnancy month 0; just below 0 is month 9.
+        pregnancy_age = np.random.uniform(-0.75, 0, n)
+        newborn_age = np.random.uniform(0, 1, n)
+        return np.where(
+            person.A_AGE == 0,
+            np.where(in_pregnancy, pregnancy_age, newborn_age),
+            # Keep the ages passed in (not raw A_AGE) for everyone else so
+            # adjustments made before this call are not discarded.
+            age,
+        )
+
+    cps["age"] = _assign_some_newborns_to_pregnancy(cps["age"], person)
+
     def children_per_parent(col: str) -> pd.DataFrame:
         """Calculate number of children in the household using parental
         pointers.
diff --git a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
index 4d9c7f9b..e6bf595c 100644
--- a/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
+++ b/policyengine_us_data/tests/test_datasets/test_enhanced_cps.py
@@ -37,3 +37,21 @@ def test_ecps_has_mortgage_interest():
 
     assert sim.calculate("deductible_mortgage_interest").sum() > 1
     assert sim.calculate("deductible_interest_expense").sum() > 1
+
+
+def test_newborns_and_pregnancies():
+    """Enhanced CPS should contain both unborn and newborn people."""
+    from policyengine_us_data.datasets.cps import EnhancedCPS_2024
+    from policyengine_us import Microsimulation
+
+    sim = Microsimulation(dataset=EnhancedCPS_2024)
+    # Compute age once and reuse it for both checks.
+    age = sim.calculate("age")
+
+    # Unborn children are encoded with a negative age.
+    unborn_count = (age < 0).sum()
+    assert unborn_count > 0
+
+    # Newborns have 0 <= age < 1.
+    newborn_count = ((age >= 0) & (age < 1)).sum()
+    assert newborn_count > 0
diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py
index a01b16a4..b743dc46 100644
--- a/policyengine_us_data/utils/loss.py
+++ b/policyengine_us_data/utils/loss.py
@@ -340,5 +340,21 @@ def build_loss_matrix(dataset: type, time_period):
         )
         targets_array.append(row["population_under_5"])
 
+    # Population targets for infants and pregnancies.
+    # NOTE(review): source and year of the infant figure are not recorded
+    # here -- TODO cite it. The pregnancy target is 0.75 x the infant target
+    # (rounded down), matching the 0.75-year window of negative ages.
+
+    age = sim.calculate("age").values
+    infants = (age >= 0) & (age < 1)
+    label = "census/infants"
+    loss_matrix[label] = sim.map_result(infants, "person", "household")
+    targets_array.append(3_491_679)
+
+    pregnancies = (age >= -0.75) & (age < 0)
+    label = "census/pregnancies"
+    loss_matrix[label] = sim.map_result(pregnancies, "person", "household")
+    targets_array.append(2_618_759)
+
     if any(loss_matrix.isna().sum() > 0):
         raise ValueError("Some targets are missing from the loss matrix")