diff --git a/.env.example b/.env.example deleted file mode 100644 index 413cd7c1..00000000 --- a/.env.example +++ /dev/null @@ -1,15 +0,0 @@ -## Copy this file to `.env` and fill in the values as needed. - -# Local development database (default if not set) -DATABASE_URL=sqlite:///policyengine.db - -# PolicyEngine live database connection pieces (used when --db-location policyengine) -# The CLI composes the URL as postgresql+psycopg2://... with sslmode=require by default. -POLICYENGINE_DB_PASSWORD= -POLICYENGINE_DB_USER=postgres -POLICYENGINE_DB_HOST=db.usugnrssspkdutcjeevk.supabase.co -POLICYENGINE_DB_PORT=5432 -POLICYENGINE_DB_NAME=postgres - -# Optional: Hugging Face token for private repos when seeding datasets from HF -HUGGING_FACE_TOKEN= diff --git a/.gitignore b/.gitignore index dc335293..57a0fc21 100644 --- a/.gitignore +++ b/.gitignore @@ -1,9 +1,8 @@ **/*.db **/__pycache__ **/*.egg-info +**/*.h5 +*.ipynb _build/ -simulations/ -test.* -supabase/ .env -**/review.md +**/.DS_Store \ No newline at end of file diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 00000000..be48ac80 --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,17 @@ +# Claude notes + +Claude, please follow these always. These principles are aimed at preventing you from producing AI slop. + +1. British English, sentence case +2. No excessive duplication, keep code files as concise as possible to produce the same meaningful value. No excessive printing +3. Don't create multiple files for successive versions. Keep checking: have I added lots of intermediate files which are deprecated? Delete them if so, but ideally don't create them in the first place + +## MicroDataFrame + +A pandas DataFrame that automatically handles weights for survey microdata. Key features: + +- Create with `MicroDataFrame(df, weights='weight_column')` +- All aggregations (sum, mean, etc.) 
automatically weight results +- Each column is a MicroSeries with weighted operations +- Use `.groupby()` for weighted group statistics +- Built-in poverty analysis: `.poverty_rate()`, `.poverty_gap()` diff --git a/Makefile b/Makefile index 931fccdd..f1e4b163 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,13 @@ format: ruff format . clean: - rm -rf **/__pycache__ _build **/_build .pytest_cache .ruff_cache **/*.egg-info **/*.pyc + find . -not -path "./.venv/*" -type d -name "__pycache__" -exec rm -rf {} + + find . -not -path "./.venv/*" -type d -name "_build" -exec rm -rf {} + + find . -not -path "./.venv/*" -type d -name ".pytest_cache" -exec rm -rf {} + + find . -not -path "./.venv/*" -type d -name ".ruff_cache" -exec rm -rf {} + + find . -not -path "./.venv/*" -type d -name "*.egg-info" -exec rm -rf {} + + find . -not -path "./.venv/*" -type f -name "*.pyc" -delete + find . -not -path "./.venv/*" -type f -name "*.h5" -delete changelog: build-changelog changelog.yaml --output changelog.yaml --update-last-date --start-from 1.0.0 --append-file changelog_entry.yaml diff --git a/README.md b/README.md index 11d18575..8c89c37f 100644 --- a/README.md +++ b/README.md @@ -1,12 +1,186 @@ # PolicyEngine.py -Documentation +A Python package for tax-benefit microsimulation analysis. Run policy simulations, analyse distributional impacts, and visualise results across the UK and US. -- Parameters, variables, and values: `docs/01_parameters_variables.ipynb` -- Policies and dynamic: `docs/02_policies_dynamic.ipynb` -- Datasets: `docs/03_datasets.ipynb` -- Simulations: `docs/04_simulations.ipynb` -- Output data items: `docs/05_output_data_items.ipynb` -- Reports and users: `docs/06_reports_users.ipynb` +## Quick start -Open these notebooks in Jupyter or your preferred IDE to run the examples. 
+```python +from policyengine.core import Simulation +from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset, uk_latest +from policyengine.outputs.aggregate import Aggregate, AggregateType + +# Load representative microdata +dataset = PolicyEngineUKDataset( + name="FRS 2023-24", + filepath="./data/frs_2023_24_year_2026.h5", + year=2026, +) + +# Run simulation +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +simulation.run() + +# Calculate total universal credit spending +agg = Aggregate( + simulation=simulation, + variable="universal_credit", + aggregate_type=AggregateType.SUM, + entity="benunit", +) +agg.run() +print(f"Total UC spending: £{agg.result / 1e9:.1f}bn") +``` + +## Documentation + +**Core concepts:** +- [Core concepts](docs/core-concepts.md): Architecture, datasets, simulations, outputs +- [UK tax-benefit model](docs/country-models-uk.md): Entities, parameters, examples +- [US tax-benefit model](docs/country-models-us.md): Entities, parameters, examples + +**Examples:** +- `examples/income_distribution_us.py`: Analyse benefit distribution by decile +- `examples/employment_income_variation_uk.py`: Model employment income phase-outs +- `examples/policy_change_uk.py`: Analyse policy reform impacts + +## Installation + +```bash +pip install policyengine +``` + +## Features + +- **Multi-country support**: UK and US tax-benefit systems +- **Representative microdata**: Load FRS, CPS, or create custom scenarios +- **Policy reforms**: Parametric reforms with date-bound parameter values +- **Distributional analysis**: Aggregate statistics by income decile, demographics +- **Entity mapping**: Automatic mapping between person, household, tax unit levels +- **Visualisation**: PolicyEngine-branded charts with Plotly + +## Key concepts + +### Datasets + +Datasets contain microdata at entity level (person, household, tax unit). 
Load representative data or create custom scenarios: + +```python +from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset + +dataset = PolicyEngineUKDataset( + name="Representative data", + filepath="./data/frs_2023_24_year_2026.h5", + year=2026, +) +dataset.load() +``` + +### Simulations + +Simulations apply tax-benefit models to datasets: + +```python +from policyengine.core import Simulation +from policyengine.tax_benefit_models.uk import uk_latest + +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +simulation.run() + +# Access calculated variables +output = simulation.output_dataset.data +print(output.household[["household_net_income", "household_benefits"]]) +``` + +### Outputs + +Extract insights with aggregate statistics: + +```python +from policyengine.outputs.aggregate import Aggregate, AggregateType + +# Mean income in top decile +agg = Aggregate( + simulation=simulation, + variable="household_net_income", + aggregate_type=AggregateType.MEAN, + filter_variable="household_net_income", + quantile=10, + quantile_eq=10, +) +agg.run() +print(f"Top decile mean income: £{agg.result:,.0f}") +``` + +### Policy reforms + +Apply parametric reforms: + +```python +from policyengine.core import Policy, Parameter, ParameterValue +import datetime + +parameter = Parameter( + name="gov.hmrc.income_tax.allowances.personal_allowance.amount", + tax_benefit_model_version=uk_latest, + data_type=float, +) + +policy = Policy( + name="Increase personal allowance", + parameter_values=[ + ParameterValue( + parameter=parameter, + start_date=datetime.date(2026, 1, 1), + end_date=datetime.date(2026, 12, 31), + value=15000, + ) + ], +) + +# Run reform simulation +reform_sim = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + policy=policy, +) +reform_sim.run() +``` + +## Country models + +### UK + +Three entity levels: +- **Person**: Individual with income and demographics +- **Benunit**: Benefit unit (single 
person or couple with children) +- **Household**: Residence unit + +Key benefits: Universal Credit, Child Benefit, Pension Credit +Key taxes: Income tax, National Insurance + +### US + +Six entity levels: +- **Person**: Individual +- **Tax unit**: Federal tax filing unit +- **SPM unit**: Supplemental Poverty Measure unit +- **Family**: Census family definition +- **Marital unit**: Married couple or single person +- **Household**: Residence unit + +Key benefits: SNAP, TANF, EITC, CTC, SSI, Social Security +Key taxes: Federal income tax, payroll tax + +## Contributing + +See [CONTRIBUTING.md](CONTRIBUTING.md) for development setup and guidelines. + +## License + +AGPL-3.0 diff --git a/changelog_entry.yaml b/changelog_entry.yaml index e69de29b..4c132743 100644 --- a/changelog_entry.yaml +++ b/changelog_entry.yaml @@ -0,0 +1,4 @@ +- bump: minor + changes: + - Just basemodels, no sqlmodels. + - Clean, working analysis at both household and macro level for uk and us. diff --git a/docs/core-concepts.md b/docs/core-concepts.md new file mode 100644 index 00000000..bdba1f39 --- /dev/null +++ b/docs/core-concepts.md @@ -0,0 +1,443 @@ +# Core concepts + +PolicyEngine.py is a Python package for tax-benefit microsimulation analysis. It provides a unified interface for running policy simulations, analysing distributional impacts, and visualising results across different countries. + +## Architecture overview + +The package is organised around several core concepts: + +- **Tax-benefit models**: Country-specific implementations (UK, US) that define tax and benefit rules +- **Datasets**: Microdata representing populations at entity level (person, household, etc.) 
+- **Simulations**: Execution environments that apply tax-benefit models to datasets +- **Outputs**: Analysis tools for extracting insights from simulation results +- **Policies**: Parametric reforms that modify tax-benefit system parameters + +## Tax-benefit models + +Tax-benefit models define the rules and calculations for a country's tax and benefit system. Each model version contains: + +- **Variables**: Calculated values (e.g., income tax, universal credit) +- **Parameters**: System settings (e.g., personal allowance, benefit rates) +- **Parameter values**: Time-bound values for parameters + +### Using a tax-benefit model + +```python +from policyengine.tax_benefit_models.uk import uk_latest +from policyengine.tax_benefit_models.us import us_latest + +# UK model includes variables like: +# - income_tax, national_insurance, universal_credit +# - Parameters like personal allowance, NI thresholds + +# US model includes variables like: +# - income_tax, payroll_tax, eitc, ctc, snap +# - Parameters like standard deduction, EITC rates +``` + +## Datasets + +Datasets contain microdata representing a population. 
Each dataset has: + +- **Entity-level data**: Separate dataframes for person, household, and other entities +- **Weights**: Survey weights for population representation +- **Join keys**: Relationships between entities (e.g., which household each person belongs to) + +### Dataset structure + +```python +from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset + +dataset = PolicyEngineUKDataset( + name="FRS 2023-24", + description="Family Resources Survey microdata", + filepath="./data/frs_2023_24_year_2026.h5", + year=2026, +) + +# Access entity-level data +person_data = dataset.data.person # MicroDataFrame +household_data = dataset.data.household +benunit_data = dataset.data.benunit # Benefit unit (UK only) +``` + +### Creating custom datasets + +You can create custom datasets for scenario analysis: + +```python +import pandas as pd +from microdf import MicroDataFrame +from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset, UKYearData + +# Create person data +person_df = MicroDataFrame( + pd.DataFrame({ + "person_id": [0, 1, 2], + "person_household_id": [0, 0, 1], + "person_benunit_id": [0, 0, 1], + "age": [35, 8, 40], + "employment_income": [30000, 0, 50000], + "person_weight": [1.0, 1.0, 1.0], + }), + weights="person_weight" +) + +# Create household data +household_df = MicroDataFrame( + pd.DataFrame({ + "household_id": [0, 1], + "region": ["LONDON", "SOUTH_EAST"], + "rent": [15000, 12000], + "household_weight": [1.0, 1.0], + }), + weights="household_weight" +) + +# Create benunit data +benunit_df = MicroDataFrame( + pd.DataFrame({ + "benunit_id": [0, 1], + "would_claim_uc": [True, True], + "benunit_weight": [1.0, 1.0], + }), + weights="benunit_weight" +) + +dataset = PolicyEngineUKDataset( + name="Custom scenario", + description="Single parent vs single adult", + filepath="./custom.h5", + year=2026, + data=UKYearData( + person=person_df, + household=household_df, + benunit=benunit_df, + ) +) +``` + +## Simulations + +Simulations apply 
tax-benefit models to datasets, calculating all variables for the specified year. + +### Running a simulation + +```python +from policyengine.core import Simulation +from policyengine.tax_benefit_models.uk import uk_latest + +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +simulation.run() + +# Access output data +output_person = simulation.output_dataset.data.person +output_household = simulation.output_dataset.data.household + +# Check calculated variables +print(output_household[["household_id", "household_net_income", "household_tax"]]) +``` + +### Accessing calculated variables + +After running a simulation, you can access the calculated variables from the output dataset: + +```python +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +simulation.run() + +# Access specific variables +output = simulation.output_dataset.data +person_data = output.person[["person_id", "age", "employment_income", "income_tax"]] +household_data = output.household[["household_id", "household_net_income"]] +benunit_data = output.benunit[["benunit_id", "universal_credit", "child_benefit"]] +``` + +## Policies + +Policies modify tax-benefit system parameters through parametric reforms. 
+ +### Creating a policy + +```python +from policyengine.core import Policy, Parameter, ParameterValue +import datetime + +# Define parameter to modify +parameter = Parameter( + name="gov.hmrc.income_tax.allowances.personal_allowance.amount", + tax_benefit_model_version=uk_latest, + description="Personal allowance for income tax", + data_type=float, +) + +# Set new value +parameter_value = ParameterValue( + parameter=parameter, + start_date=datetime.date(2026, 1, 1), + end_date=datetime.date(2026, 12, 31), + value=15000, # Increase from ~£12,570 to £15,000 +) + +policy = Policy( + name="Increased personal allowance", + description="Raises personal allowance to £15,000", + parameter_values=[parameter_value], +) +``` + +### Running a reform simulation + +```python +# Baseline simulation +baseline = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +baseline.run() + +# Reform simulation +reform = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + policy=policy, +) +reform.run() +``` + +## Outputs + +Output classes provide structured analysis of simulation results. 
+ +### Aggregate + +Calculate aggregate statistics (sum, mean, count) for any variable: + +```python +from policyengine.outputs.aggregate import Aggregate, AggregateType + +# Total universal credit spending +agg = Aggregate( + simulation=simulation, + variable="universal_credit", + aggregate_type=AggregateType.SUM, + entity="benunit", # Map to benunit level +) +agg.run() +print(f"Total UC spending: £{agg.result / 1e9:.1f}bn") + +# Mean household income in top decile +agg = Aggregate( + simulation=simulation, + variable="household_net_income", + aggregate_type=AggregateType.MEAN, + filter_variable="household_net_income", + quantile=10, + quantile_eq=10, # 10th decile +) +agg.run() +print(f"Mean income in top decile: £{agg.result:,.0f}") +``` + +### ChangeAggregate + +Analyse impacts of policy reforms: + +```python +from policyengine.outputs.change_aggregate import ChangeAggregate, ChangeAggregateType + +# Count winners and losers +winners = ChangeAggregate( + baseline_simulation=baseline, + reform_simulation=reform, + variable="household_net_income", + aggregate_type=ChangeAggregateType.COUNT, + change_geq=1, # Gain at least £1 +) +winners.run() +print(f"Winners: {winners.result / 1e6:.1f}m households") + +losers = ChangeAggregate( + baseline_simulation=baseline, + reform_simulation=reform, + variable="household_net_income", + aggregate_type=ChangeAggregateType.COUNT, + change_leq=-1, # Lose at least £1 +) +losers.run() +print(f"Losers: {losers.result / 1e6:.1f}m households") + +# Revenue impact +revenue = ChangeAggregate( + baseline_simulation=baseline, + reform_simulation=reform, + variable="household_tax", + aggregate_type=ChangeAggregateType.SUM, +) +revenue.run() +print(f"Revenue change: £{revenue.result / 1e9:.1f}bn") +``` + +## Entity mapping + +The package automatically handles entity mapping when variables are defined at different entity levels. 
+ +### Entity hierarchy + +**UK:** +``` +household + └── benunit (benefit unit) + └── person +``` + +**US:** +``` +household + ├── tax_unit + ├── spm_unit + ├── family + └── marital_unit + └── person +``` + +### Automatic mapping + +When you request a person-level variable (like `ssi`) at household level, the package: +1. Sums person-level values within each household (aggregation) +2. Returns household-level data with proper weights + +```python +# SSI is defined at person level, but we want household-level totals +agg = Aggregate( + simulation=simulation, + variable="ssi", # Person-level variable + entity="household", # Target household level + aggregate_type=AggregateType.SUM, +) +# Internally maps person → household by summing SSI for all persons in each household +``` + +When you request a household-level variable at person level: +1. Replicates household values to all persons in that household (expansion) + +## Visualisation + +The package includes utilities for creating PolicyEngine-branded visualisations: + +```python +from policyengine.utils.plotting import format_fig, COLORS +import plotly.graph_objects as go + +fig = go.Figure() +fig.add_trace(go.Scatter(x=[1, 2, 3], y=[4, 5, 6])) + +format_fig( + fig, + title="My chart", + xaxis_title="X axis", + yaxis_title="Y axis", + height=600, + width=800, +) +fig.show() +``` + +### Brand colours + +```python +COLORS = { + "primary": "#319795", # Teal + "success": "#22C55E", # Green + "warning": "#FEC601", # Yellow + "error": "#EF4444", # Red + "info": "#1890FF", # Blue + "blue_secondary": "#026AA2", # Dark blue + "gray": "#667085", # Gray +} +``` + +## Common workflows + +### 1. Analyse employment income variation + +See `examples/employment_income_variation_uk.py` for a complete example of: +- Creating custom datasets with varied parameters +- Running single simulations +- Extracting results with filters +- Visualising benefit phase-outs + +### 2. 
Policy reform analysis + +See `examples/policy_change_uk.py` for: +- Applying parametric reforms +- Comparing baseline and reform +- Analysing winners/losers by decile +- Calculating revenue impacts + +### 3. Distributional analysis + +See `examples/income_distribution_us.py` for: +- Loading representative microdata +- Calculating statistics by income decile +- Mapping variables across entity levels +- Creating interactive visualisations + +## Best practices + +### Creating custom datasets + +1. **Always set would_claim variables**: Benefits won't be claimed unless explicitly enabled + ```python + "would_claim_uc": [True] * n_households + ``` + +2. **Set disability variables explicitly**: Prevents random UC spikes from LCWRA element + ```python + "is_disabled_for_benefits": [False] * n_people + "uc_limited_capability_for_WRA": [False] * n_people + ``` + +3. **Include required join keys**: Person data needs entity membership + ```python + "person_household_id": household_ids + "person_benunit_id": benunit_ids # UK only + ``` + +4. **Set required household fields**: Vary by country + ```python + # UK + "region": ["LONDON"] * n_households + "tenure_type": ["RENT_PRIVATELY"] * n_households + + # US + "state_code": ["CA"] * n_households + ``` + +### Performance optimisation + +1. **Single simulation for variations**: Create all scenarios in one dataset, run once +2. **Custom variable selection**: Only calculate needed variables +3. **Filter efficiently**: Use quantile filters for decile analysis +4. **Parallel analysis**: Multiple Aggregate calls can run independently + +### Data integrity + +1. **Check weights**: Ensure weights sum to expected population +2. **Validate join keys**: All persons should link to valid households +3. **Review output ranges**: Check calculated values are reasonable +4. 
**Test edge cases**: Zero income, high income, disabled, elderly + +## Next steps + +- See `examples/` for complete working examples +- Review country-specific documentation: + - [UK tax-benefit model](country-models-uk.md) + - [US tax-benefit model](country-models-us.md) +- Explore the API reference for detailed class documentation diff --git a/docs/country-models-uk.md b/docs/country-models-uk.md new file mode 100644 index 00000000..27d7dae7 --- /dev/null +++ b/docs/country-models-uk.md @@ -0,0 +1,340 @@ +# UK tax-benefit model + +The UK tax-benefit model implements the United Kingdom's tax and benefit system using PolicyEngine UK as the underlying calculation engine. + +## Entity structure + +The UK model uses three entity levels: + +``` +household + └── benunit (benefit unit) + └── person +``` + +### Person + +Individual people with demographic and income characteristics. + +**Key variables:** +- `age`: Person's age in years +- `employment_income`: Annual employment income +- `self_employment_income`: Annual self-employment income +- `pension_income`: Annual pension income +- `savings_interest_income`: Annual interest from savings +- `dividend_income`: Annual dividend income +- `income_tax`: Total income tax paid +- `national_insurance`: Total NI contributions +- `is_disabled_for_benefits`: Whether disabled for benefit purposes + +### Benunit (benefit unit) + +The unit for benefit assessment. Usually a single person or a couple with dependent children. 
+ +**Key variables:** +- `universal_credit`: Annual UC payment +- `child_benefit`: Annual child benefit +- `working_tax_credit`: Annual WTC (legacy system) +- `child_tax_credit`: Annual CTC (legacy system) +- `pension_credit`: Annual pension credit +- `income_support`: Annual income support +- `housing_benefit`: Annual housing benefit +- `council_tax_support`: Annual council tax support + +**Important flags:** +- `would_claim_uc`: Must be True to claim UC +- `would_claim_WTC`: Must be True to claim WTC +- `would_claim_CTC`: Must be True to claim CTC +- `would_claim_IS`: Must be True to claim IS +- `would_claim_pc`: Must be True to claim pension credit +- `would_claim_child_benefit`: Must be True to claim child benefit +- `would_claim_housing_benefit`: Must be True to claim HB + +### Household + +The residence unit, typically sharing accommodation. + +**Key variables:** +- `household_net_income`: Total household net income +- `hbai_household_net_income`: HBAI-equivalised net income +- `household_benefits`: Total benefits received +- `household_tax`: Total tax paid +- `household_market_income`: Total market income + +**Required fields:** +- `region`: UK region (e.g., "LONDON", "SOUTH_EAST") +- `tenure_type`: Housing tenure (e.g., "RENT_PRIVATELY", "OWNED_OUTRIGHT") +- `rent`: Annual rent paid +- `council_tax`: Annual council tax + +## Using the UK model + +### Loading representative data + +```python +from policyengine.tax_benefit_models.uk import PolicyEngineUKDataset + +dataset = PolicyEngineUKDataset( + name="FRS 2023-24", + description="Family Resources Survey microdata", + filepath="./data/frs_2023_24_year_2026.h5", + year=2026, +) + +print(f"People: {len(dataset.data.person):,}") +print(f"Benefit units: {len(dataset.data.benunit):,}") +print(f"Households: {len(dataset.data.household):,}") +``` + +### Creating custom scenarios + +```python +import pandas as pd +from microdf import MicroDataFrame +from policyengine.tax_benefit_models.uk import UKYearData + +# 
Single parent with 2 children +person_df = MicroDataFrame( + pd.DataFrame({ + "person_id": [0, 1, 2], + "person_benunit_id": [0, 0, 0], + "person_household_id": [0, 0, 0], + "age": [35, 8, 5], + "employment_income": [25000, 0, 0], + "person_weight": [1.0, 1.0, 1.0], + "is_disabled_for_benefits": [False, False, False], + "uc_limited_capability_for_WRA": [False, False, False], + }), + weights="person_weight" +) + +benunit_df = MicroDataFrame( + pd.DataFrame({ + "benunit_id": [0], + "benunit_weight": [1.0], + "would_claim_uc": [True], + "would_claim_child_benefit": [True], + "would_claim_WTC": [True], + "would_claim_CTC": [True], + }), + weights="benunit_weight" +) + +household_df = MicroDataFrame( + pd.DataFrame({ + "household_id": [0], + "household_weight": [1.0], + "region": ["LONDON"], + "rent": [15000], # £1,250/month + "council_tax": [2000], + "tenure_type": ["RENT_PRIVATELY"], + }), + weights="household_weight" +) + +dataset = PolicyEngineUKDataset( + name="Single parent scenario", + description="One adult, two children", + filepath="./single_parent.h5", + year=2026, + data=UKYearData( + person=person_df, + benunit=benunit_df, + household=household_df, + ) +) +``` + +### Running a simulation + +```python +from policyengine.core import Simulation +from policyengine.tax_benefit_models.uk import uk_latest + +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, +) +simulation.run() + +# Check results +output = simulation.output_dataset.data +print(output.household[["household_net_income", "household_benefits", "household_tax"]]) +``` + +## Key parameters + +### Income tax + +- `gov.hmrc.income_tax.allowances.personal_allowance.amount`: Personal allowance (£12,570 in 2024-25) +- `gov.hmrc.income_tax.rates.uk[0].rate`: Basic rate (20%) +- `gov.hmrc.income_tax.rates.uk[1].rate`: Higher rate (40%) +- `gov.hmrc.income_tax.rates.uk[2].rate`: Additional rate (45%) +- `gov.hmrc.income_tax.rates.uk[0].threshold`: Basic rate threshold (£50,270) 
+- `gov.hmrc.income_tax.rates.uk[1].threshold`: Higher rate threshold (£125,140) + +### National insurance + +- `gov.hmrc.national_insurance.class_1.main.primary_threshold`: Primary threshold (£12,570) +- `gov.hmrc.national_insurance.class_1.main.upper_earnings_limit`: Upper earnings limit (£50,270) +- `gov.hmrc.national_insurance.class_1.main.rate`: Main rate (12% below UEL, 2% above) + +### Universal credit + +- `gov.dwp.universal_credit.elements.standard_allowance.single_adult`: Standard allowance for single adult (£334.91/month in 2024-25) +- `gov.dwp.universal_credit.elements.child.first_child`: First child element (£333.33/month) +- `gov.dwp.universal_credit.elements.child.subsequent_child`: Subsequent children (£287.92/month each) +- `gov.dwp.universal_credit.means_test.reduction_rate`: Taper rate (55%) +- `gov.dwp.universal_credit.means_test.earned_income.disregard`: Work allowance + +### Child benefit + +- `gov.hmrc.child_benefit.rates.eldest_child`: First child rate (£25.60/week) +- `gov.hmrc.child_benefit.rates.additional_child`: Additional children (£16.95/week each) +- `gov.hmrc.child_benefit.income_tax_charge.threshold`: HICBC threshold (£60,000) + +## Common policy reforms + +### Increasing personal allowance + +```python +from policyengine.core import Policy, Parameter, ParameterValue +import datetime + +parameter = Parameter( + name="gov.hmrc.income_tax.allowances.personal_allowance.amount", + tax_benefit_model_version=uk_latest, + description="Personal allowance", + data_type=float, +) + +policy = Policy( + name="Increase personal allowance to £15,000", + description="Raises personal allowance from £12,570 to £15,000", + parameter_values=[ + ParameterValue( + parameter=parameter, + start_date=datetime.date(2026, 1, 1), + end_date=datetime.date(2026, 12, 31), + value=15000, + ) + ], +) +``` + +### Adjusting UC taper rate + +```python +parameter = Parameter( + name="gov.dwp.universal_credit.means_test.reduction_rate", + 
tax_benefit_model_version=uk_latest, + description="UC taper rate", + data_type=float, +) + +policy = Policy( + name="Reduce UC taper to 50%", + description="Lowers taper rate from 55% to 50%", + parameter_values=[ + ParameterValue( + parameter=parameter, + start_date=datetime.date(2026, 1, 1), + end_date=datetime.date(2026, 12, 31), + value=0.50, # 50% + ) + ], +) +``` + +### Abolishing two-child limit + +```python +# Set subsequent child element equal to first child +parameter = Parameter( + name="gov.dwp.universal_credit.elements.child.subsequent_child", + tax_benefit_model_version=uk_latest, + description="UC subsequent child element", + data_type=float, +) + +policy = Policy( + name="Abolish two-child limit", + description="Sets subsequent child element equal to first child", + parameter_values=[ + ParameterValue( + parameter=parameter, + start_date=datetime.date(2026, 1, 1), + end_date=datetime.date(2026, 12, 31), + value=333.33, # Match first child rate + ) + ], +) +``` + +## Regional variations + +The UK model accounts for regional differences: + +- **Council tax**: Varies by local authority +- **Rent levels**: Regional housing markets +- **Scottish income tax**: Different rates and thresholds for Scottish taxpayers + +### Regions + +Valid region values: +- `LONDON` +- `SOUTH_EAST` +- `SOUTH_WEST` +- `EAST_OF_ENGLAND` +- `WEST_MIDLANDS` +- `EAST_MIDLANDS` +- `YORKSHIRE` +- `NORTH_WEST` +- `NORTH_EAST` +- `WALES` +- `SCOTLAND` +- `NORTHERN_IRELAND` + +## Data sources + +The UK model can use several data sources: + +1. **Family Resources Survey (FRS)**: Official UK household survey + - ~19,000 households + - Detailed income and benefit receipt + - Published annually + +2. **Enhanced FRS**: Uprated and enhanced version + - Calibrated to population totals + - Additional imputed variables + - Multiple projection years + +3. 
**Custom datasets**: User-created scenarios + - Full control over household composition + - Exact income levels + - Specific benefit claiming patterns + +## Validation + +When creating custom datasets, validate: + +1. **Would claim flags**: All set to True +2. **Disability flags**: Set explicitly (not random) +3. **Join keys**: Person data links to benunits and households +4. **Required fields**: Region, tenure_type set correctly +5. **Weights**: Sum to expected values +6. **Income ranges**: Realistic values + +## Examples + +See working examples in the `examples/` directory: + +- `employment_income_variation_uk.py`: Vary employment income, analyse benefit phase-outs +- `policy_change_uk.py`: Apply reforms, analyse winners/losers +- `income_bands_uk.py`: Create income band scenarios + +## References + +- PolicyEngine UK documentation: https://policyengine.github.io/policyengine-uk/ +- UK tax-benefit system: https://www.gov.uk/browse/benefits +- HBAI methodology: https://www.gov.uk/government/statistics/households-below-average-income-for-financial-years-ending-1995-to-2023 diff --git a/docs/country-models-us.md b/docs/country-models-us.md new file mode 100644 index 00000000..927966ee --- /dev/null +++ b/docs/country-models-us.md @@ -0,0 +1,413 @@ +# US tax-benefit model + +The US tax-benefit model implements the United States federal tax and benefit system using PolicyEngine US as the underlying calculation engine. + +## Entity structure + +The US model uses a more complex entity hierarchy: + +``` +household + ├── tax_unit (federal tax filing unit) + ├── spm_unit (Supplemental Poverty Measure unit) + ├── family (Census definition) + └── marital_unit (married couple or single person) + └── person +``` + +### Person + +Individual people with demographic and income characteristics. 
+ +**Key variables:** +- `age`: Person's age in years +- `employment_income`: Annual employment income +- `self_employment_income`: Annual self-employment income +- `social_security`: Annual Social Security benefits +- `ssi`: Annual Supplemental Security Income +- `medicaid`: Annual Medicaid value +- `medicare`: Annual Medicare value +- `unemployment_compensation`: Annual unemployment benefits + +### Tax unit + +The federal tax filing unit (individual or married filing jointly). + +**Key variables:** +- `income_tax`: Federal income tax liability +- `employee_payroll_tax`: Employee payroll tax (FICA) +- `eitc`: Earned Income Tax Credit +- `ctc`: Child Tax Credit +- `income_tax_before_credits`: Tax before credits + +### SPM unit + +The Supplemental Poverty Measure unit used for SNAP and other means-tested benefits. + +**Key variables:** +- `snap`: Annual SNAP (food stamps) benefits +- `tanf`: Annual TANF (cash assistance) benefits +- `spm_unit_net_income`: SPM net income +- `spm_unit_size`: Number of people in unit + +### Family + +Census definition of family (related individuals). + +**Key variables:** +- `family_id`: Family identifier +- `family_weight`: Survey weight + +### Marital unit + +Married couple or single person. + +**Key variables:** +- `marital_unit_id`: Marital unit identifier +- `marital_unit_weight`: Survey weight + +### Household + +The residence unit. 
+ +**Key variables:** +- `household_net_income`: Total household net income +- `household_benefits`: Total benefits received +- `household_tax`: Total tax paid +- `household_market_income`: Total market income before taxes and transfers + +**Required fields:** +- `state_code`: State (e.g., "CA", "NY", "TX") + +## Using the US model + +### Loading representative data + +```python +from policyengine.tax_benefit_models.us import PolicyEngineUSDataset + +dataset = PolicyEngineUSDataset( + name="Enhanced CPS 2024", + description="Enhanced Current Population Survey microdata", + filepath="./data/enhanced_cps_2024_year_2024.h5", + year=2024, +) + +print(f"People: {len(dataset.data.person):,}") +print(f"Tax units: {len(dataset.data.tax_unit):,}") +print(f"SPM units: {len(dataset.data.spm_unit):,}") +print(f"Households: {len(dataset.data.household):,}") +``` + +### Creating custom scenarios + +```python +import pandas as pd +from microdf import MicroDataFrame +from policyengine.tax_benefit_models.us import USYearData + +# Married couple with 2 children +person_df = MicroDataFrame( + pd.DataFrame({ + "person_id": [0, 1, 2, 3], + "person_household_id": [0, 0, 0, 0], + "person_tax_unit_id": [0, 0, 0, 0], + "person_spm_unit_id": [0, 0, 0, 0], + "person_family_id": [0, 0, 0, 0], + "person_marital_unit_id": [0, 0, 1, 2], + "age": [35, 33, 8, 5], + "employment_income": [60000, 40000, 0, 0], + "person_weight": [1.0, 1.0, 1.0, 1.0], + }), + weights="person_weight" +) + +tax_unit_df = MicroDataFrame( + pd.DataFrame({ + "tax_unit_id": [0], + "tax_unit_weight": [1.0], + }), + weights="tax_unit_weight" +) + +spm_unit_df = MicroDataFrame( + pd.DataFrame({ + "spm_unit_id": [0], + "spm_unit_weight": [1.0], + }), + weights="spm_unit_weight" +) + +family_df = MicroDataFrame( + pd.DataFrame({ + "family_id": [0], + "family_weight": [1.0], + }), + weights="family_weight" +) + +marital_unit_df = MicroDataFrame( + pd.DataFrame({ + "marital_unit_id": [0, 1, 2], + "marital_unit_weight": [1.0, 1.0, 
1.0], + }), + weights="marital_unit_weight" +) + +household_df = MicroDataFrame( + pd.DataFrame({ + "household_id": [0], + "household_weight": [1.0], + "state_code": ["CA"], + }), + weights="household_weight" +) + +dataset = PolicyEngineUSDataset( + name="Married couple scenario", + description="Two adults, two children", + filepath="./married_couple.h5", + year=2024, + data=USYearData( + person=person_df, + tax_unit=tax_unit_df, + spm_unit=spm_unit_df, + family=family_df, + marital_unit=marital_unit_df, + household=household_df, + ) +) +``` + +### Running a simulation + +```python +from policyengine.core import Simulation +from policyengine.tax_benefit_models.us import us_latest + +simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=us_latest, +) +simulation.run() + +# Check results +output = simulation.output_dataset.data +print(output.household[["household_net_income", "household_benefits", "household_tax"]]) +``` + +## Key parameters + +### Income tax + +- `gov.irs.income.standard_deduction.joint`: Standard deduction (married filing jointly) +- `gov.irs.income.standard_deduction.single`: Standard deduction (single) +- `gov.irs.income.bracket.rates[0]`: 10% bracket rate +- `gov.irs.income.bracket.rates[1]`: 12% bracket rate +- `gov.irs.income.bracket.rates[2]`: 22% bracket rate +- `gov.irs.income.bracket.thresholds.joint[0]`: 10% bracket threshold (MFJ) +- `gov.irs.income.bracket.thresholds.single[0]`: 10% bracket threshold (single) + +### Payroll tax + +- `gov.ssa.payroll.rate.employee`: Employee OASDI rate (6.2%) +- `gov.medicare.payroll.rate`: Medicare rate (1.45%) +- `gov.ssa.payroll.cap`: OASDI wage base ($168,600 in 2024) + +### Child Tax Credit + +- `gov.irs.credits.ctc.amount.base`: Base CTC amount ($2,000 per child) +- `gov.irs.credits.ctc.refundable.amount.max`: Maximum refundable amount ($1,700) +- `gov.irs.credits.ctc.phase_out.threshold.joint`: Phase-out threshold (MFJ) +- `gov.irs.credits.ctc.phase_out.rate`: Phase-out rate + 
+### Earned Income Tax Credit + +- `gov.irs.credits.eitc.max[0]`: Maximum EITC (0 children) +- `gov.irs.credits.eitc.max[1]`: Maximum EITC (1 child) +- `gov.irs.credits.eitc.max[2]`: Maximum EITC (2 children) +- `gov.irs.credits.eitc.max[3]`: Maximum EITC (3+ children) +- `gov.irs.credits.eitc.phase_out.start[0]`: Phase-out start (0 children) +- `gov.irs.credits.eitc.phase_out.rate[0]`: Phase-out rate (0 children) + +### SNAP + +- `gov.usda.snap.normal_allotment.max[1]`: Maximum benefit (1 person) +- `gov.usda.snap.normal_allotment.max[2]`: Maximum benefit (2 people) +- `gov.usda.snap.income_limit.net`: Net income limit (100% FPL) +- `gov.usda.snap.income_deduction.earned.rate`: Earned income deduction rate (20%) + +## Common policy reforms + +### Increasing standard deduction + +```python +from policyengine.core import Policy, Parameter, ParameterValue +import datetime + +parameter = Parameter( + name="gov.irs.income.standard_deduction.single", + tax_benefit_model_version=us_latest, + description="Standard deduction (single)", + data_type=float, +) + +policy = Policy( + name="Increase standard deduction to $20,000", + description="Raises single standard deduction from $14,600 to $20,000", + parameter_values=[ + ParameterValue( + parameter=parameter, + start_date=datetime.date(2024, 1, 1), + end_date=datetime.date(2024, 12, 31), + value=20000, + ) + ], +) +``` + +### Expanding Child Tax Credit + +```python +parameter = Parameter( + name="gov.irs.credits.ctc.amount.base", + tax_benefit_model_version=us_latest, + description="Base CTC amount", + data_type=float, +) + +policy = Policy( + name="Increase CTC to $3,000", + description="Expands CTC from $2,000 to $3,000 per child", + parameter_values=[ + ParameterValue( + parameter=parameter, + start_date=datetime.date(2024, 1, 1), + end_date=datetime.date(2024, 12, 31), + value=3000, + ) + ], +) +``` + +### Making CTC fully refundable + +```python +parameter = Parameter( + 
name="gov.irs.credits.ctc.refundable.amount.max", + tax_benefit_model_version=us_latest, + description="Maximum refundable CTC", + data_type=float, +) + +policy = Policy( + name="Fully refundable CTC", + description="Makes entire $2,000 CTC refundable", + parameter_values=[ + ParameterValue( + parameter=parameter, + start_date=datetime.date(2024, 1, 1), + end_date=datetime.date(2024, 12, 31), + value=2000, # Match base amount + ) + ], +) +``` + +## State variations + +The US model includes state-level variations for: + +- **State income tax**: Different rates and structures by state +- **State EITC**: State supplements to federal EITC +- **Medicaid**: State-specific eligibility and benefits +- **TANF**: State-administered cash assistance + +### State codes + +Use two-letter state codes (e.g., "CA", "NY", "TX"). All 50 states plus DC are supported. + +## Entity mapping considerations + +The US model's complex entity structure requires careful attention to entity mapping: + +### Person → Household + +When mapping person-level variables (like `ssi`) to household level, values are summed across all household members: + +```python +agg = Aggregate( + simulation=simulation, + variable="ssi", # Person-level + entity="household", # Aggregate to household + aggregate_type=AggregateType.SUM, +) +# Result: Total SSI for all persons in each household +``` + +### Tax unit → Household + +Tax units nest within households. 
A household may contain multiple tax units (e.g., adult child filing separately): + +```python +agg = Aggregate( + simulation=simulation, + variable="income_tax", # Tax unit level + entity="household", # Aggregate to household + aggregate_type=AggregateType.SUM, +) +# Result: Total income tax for all tax units in each household +``` + +### Household → Person + +Household variables are replicated to all household members: + +```python +# household_net_income at person level +# Each person in household gets the same household_net_income value +``` + +## Data sources + +The US model can use several data sources: + +1. **Current Population Survey (CPS)**: Census Bureau household survey + - ~60,000 households + - Detailed income and demographic data + - Published annually + +2. **Enhanced CPS**: Calibrated and enhanced version + - Uprated to population totals + - Imputed benefit receipt + - Multiple projection years + +3. **Custom datasets**: User-created scenarios + - Full control over household composition + - Exact income levels + - Specific tax filing scenarios + +## Validation + +When creating custom datasets, validate: + +1. **Entity relationships**: All persons link to valid tax_unit, spm_unit, household +2. **Join key naming**: Use `person_household_id`, `person_tax_unit_id`, etc. +3. **Weights**: Appropriate weights for each entity level +4. **State codes**: Valid two-letter codes +5. 
**Filing status**: Tax units should reflect actual filing patterns + +## Examples + +See working examples in the `examples/` directory: + +- `income_distribution_us.py`: Analyse benefit distribution by income decile +- `employment_income_variation_us.py`: Vary employment income, analyse phase-outs +- `speedtest_us_simulation.py`: Performance benchmarking + +## References + +- PolicyEngine US documentation: https://policyengine.github.io/policyengine-us/ +- IRS tax information: https://www.irs.gov/forms-pubs +- Benefits.gov: https://www.benefits.gov/ +- SPM methodology: https://www.census.gov/topics/income-poverty/supplemental-poverty-measure.html diff --git a/docs/dev.md b/docs/dev.md index 8b7ded2c..accfa48c 100644 --- a/docs/dev.md +++ b/docs/dev.md @@ -5,4 +5,4 @@ General principles for developing this package's codebase go here. 1. **STRONG** preference for simplicity. Let's make this package as simple as it possibly can be. 2. Remember the goal of this package: to make it easy to create, run, save and analyse PolicyEngine simulations. When considering further features, always ask: can we instead *make it super easy* for people to do this outside the package? 3. Be consistent about property names. `name` = human readable few words you could put as the noun in a sentence without fail. `id` = unique identifier, ideally a UUID. `description` = longer human readable text that describes the object. `created_at` and `updated_at` = timestamps for when the object was created and last updated. -4. Constraints can be good. We should set constraints where they help us simplify the codebase and usage, but not where they unnecessarily block useful functionality. For example: a `Model`, e.g. PolicyEngine UK, is restricted to being basically a set of variables, baseline parameters, and a `f: set of tables -> set of tables` function. \ No newline at end of file +4. Constraints can be good. 
We should set constraints where they help us simplify the codebase and usage, but not where they unnecessarily block useful functionality. \ No newline at end of file diff --git a/docs/index.md b/docs/index.md index 6e92c81c..dd467d12 100644 --- a/docs/index.md +++ b/docs/index.md @@ -4,7 +4,6 @@ This package aims to simplify and productionise the use of PolicyEngine's tax-be We do this by: * Standardising around a set of core types that let us do policy analysis in an object-oriented way -* Provide a nice clean interface to put instances of these types in a database * Exemplifying this behaviour by using this package in all PolicyEngine's production applications, and analyses In this documentation, we'll walk through the core concepts/types that this package makes available, and how you can use them to run policy analyses at scale. diff --git a/docs/models.md b/docs/models.md deleted file mode 100644 index 010037a3..00000000 --- a/docs/models.md +++ /dev/null @@ -1,47 +0,0 @@ -# PolicyEngine model types guide - -This repository contains several model types that work together to enable policy simulation and analysis. Here's what each does: - -## Core simulation models - -**Model** - The main computational engine that registers tax-benefit systems (UK/US) and provides the simulation function. Contains logic to create seed objects from tax-benefit parameters. - -**Simulation** - Orchestrates policy analysis by combining a model, dataset, policy changes, and dynamic effects. Runs the model's simulation function and stores results. - -**ModelVersion** - Tracks different versions of a model implementation, allowing for comparison across model iterations. - -## Policy configuration - -**Policy** - Defines policy reforms through parameter value changes. Can include a custom simulation modifier function for complex reforms. - -**Dynamic** - Similar to Policy but specifically for dynamic/behavioural responses to policy changes. 
- -**Parameter** - Represents a single policy parameter (e.g., tax rate, benefit amount) within a model. - -**ParameterValue** - A specific value for a parameter at a given time period. - -**BaselineParameterValue** - Default/baseline values for parameters before any policy changes. - -**BaselineVariable** - Variables in the baseline scenario used for comparison. - -## Data handling - -**Dataset** - Contains the input data (households, people, etc.) for a simulation, with optional versioning and year specification. - -**VersionedDataset** - Manages different versions of datasets over time. - -## Results and reporting - -**Report** - Container for analysis results with timestamp tracking. - -**ReportElement** - Individual components within a report (charts, tables, metrics). - -**Aggregate** - Computes aggregated statistics (sum, mean, count) from simulation results, with optional filtering. - -**AggregateType** - Enum defining the available aggregation functions. - -## Supporting models - -**User** - User account management for the platform. - -**SeedObjects** - Helper class for batch-creating initial database objects when registering a new model. \ No newline at end of file diff --git a/docs/myst.yml b/docs/myst.yml index c8ccc8d5..053152c6 100644 --- a/docs/myst.yml +++ b/docs/myst.yml @@ -11,8 +11,9 @@ project: toc: # Auto-generated by `myst init --write-toc` - file: index.md - - file: quickstart.ipynb - - file: models.md + - file: core-concepts.md + - file: country-models-uk.md + - file: country-models-us.md - file: dev.md site: diff --git a/docs/quickstart.ipynb b/docs/quickstart.ipynb deleted file mode 100644 index 9340cb1a..00000000 --- a/docs/quickstart.ipynb +++ /dev/null @@ -1,2242 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "b5510438", - "metadata": {}, - "source": [ - "# Getting started\n", - "\n", - "In this notebook, we'll walk through how to use the PolicyEngine.py package to run simulations and produce analyses. 
We'll start with a basic analysis in the UK that doesn't use any databases, and then start saving and loading things into a database.\n", - "\n", - "## Running a baseline simulation\n", - "\n", - "To start, let's run through a simulation of the UK, and create a chart of the distribution of household income." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "7eb9b5a0", - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "\n", - " \n", - " \n", - " \n", - " \n", - " " - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "marker": { - "color": "#319795" - }, - "textfont": { - "color": "#6B7280", - "family": "Roboto Mono, monospace", - "size": 11 - }, - "textposition": "outside", - "texttemplate": "%{y:,.0f}", - "type": "bar", - "x": [ - "£0", - "£20,000", - "£40,000", - "£60,000", - "£80,000", - "£100,000", - "£150,000", - "£200,000", - "£300,000", - "£500,000" - ], - "y": [ - 6628102.860910795, - 10308039.540624166, - 7153251.306053954, - 4288185.176098487, - 1690702.647548969, - 1320125.7573599513, - 326073.73102501093, - 187608.23132836912, - 63106.63353048405, - 41838.373842805624 - ] - } - ], - "layout": { - "font": { - "color": "#101828", - "family": "Roboto, sans-serif", - "size": 14 - }, - "hoverlabel": { - "bgcolor": "white", - "bordercolor": "#81E6D9", - "font": { - "family": "Roboto Mono, monospace", - "size": 12 - } - }, - "hovermode": "x unified", - "paper_bgcolor": "white", - "plot_bgcolor": "white", - "showlegend": false, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": 
"#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, 
- "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - "fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 
0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { - "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 
0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - 
"gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "font": { - "color": "#101828", - "family": "Roboto, sans-serif", - "size": 20, - "weight": 500 - }, - "text": "The distribution of household income in the UK" - }, - "xaxis": { - "gridcolor": "#E2E8F0", - "gridwidth": 1, - "linecolor": "#E2E8F0", - "linewidth": 1, - "mirror": false, - "showgrid": true, - "showline": true, - "tickfont": { - "color": "#6B7280", - "family": "Roboto Mono, monospace", - "size": 11 - }, - "title": { - "font": { - "color": "#6B7280" - }, - "text": "Income range" - }, - "zeroline": true, - "zerolinecolor": "#F2F4F7", - "zerolinewidth": 1 - }, - "yaxis": { - "gridcolor": "#E2E8F0", - "gridwidth": 1, - "linecolor": "#E2E8F0", - "linewidth": 1, - "mirror": false, - "showgrid": true, - 
"showline": true, - "tickfont": { - "color": "#6B7280", - "family": "Roboto Mono, monospace", - "size": 11 - }, - "title": { - "font": { - "color": "#6B7280" - }, - "text": "Number of households" - }, - "zeroline": true, - "zerolinecolor": "#F2F4F7", - "zerolinewidth": 1 - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "import plotly.graph_objects as go\n", - "\n", - "from policyengine.models import (\n", - " Aggregate,\n", - " Simulation,\n", - " policyengine_uk_latest_version,\n", - " policyengine_uk_model,\n", - ")\n", - "from policyengine.utils.charts import add_fonts, format_figure\n", - "from policyengine.utils.datasets import create_uk_dataset\n", - "\n", - "# Load the dataset\n", - "\n", - "uk_dataset = create_uk_dataset()\n", - "\n", - "# Create and run the simulation\n", - "\n", - "\n", - "sim = Simulation(\n", - " dataset=uk_dataset,\n", - " model=policyengine_uk_model,\n", - " model_version=policyengine_uk_latest_version,\n", - ")\n", - "\n", - "sim.run()\n", - "\n", - "# Extract aggregates for household income ranges\n", - "\n", - "income_ranges = [\n", - " 0,\n", - " 20000,\n", - " 40000,\n", - " 60000,\n", - " 80000,\n", - " 100000,\n", - " 150000,\n", - " 200000,\n", - " 300000,\n", - " 500000,\n", - " 1_000_000,\n", - "]\n", - "aggregates = []\n", - "for i in range(len(income_ranges) - 1):\n", - " aggregates.append(\n", - " Aggregate(\n", - " entity=\"household\",\n", - " variable_name=\"hbai_household_net_income\",\n", - " aggregate_function=\"count\",\n", - " filter_variable_name=\"hbai_household_net_income\",\n", - " filter_variable_geq=income_ranges[i],\n", - " filter_variable_leq=income_ranges[i + 1],\n", - " simulation=sim,\n", - " )\n", - " )\n", - "\n", - "aggregates = Aggregate.run(aggregates)\n", - "\n", - "# Create the bar chart\n", - "\n", - "fig = go.Figure(\n", - " data=[\n", - " go.Bar(\n", - " x=[f\"£{inc:,}\" for inc in income_ranges[:-1]],\n", - " y=[agg.value for agg in aggregates],\n", 
- " )\n", - " ]\n", - ")\n", - "\n", - "# Apply formatting\n", - "\n", - "format_figure(\n", - " fig,\n", - " title=\"The distribution of household income in the UK\",\n", - " x_title=\"Income range\",\n", - " y_title=\"Number of households\",\n", - ")" - ] - }, - { - "cell_type": "markdown", - "id": "24ba497b", - "metadata": {}, - "source": [ - "So, in this example we introduced a few concepts:\n", - "\n", - "* The `Simulation` object, which represents a full run of a microsimulation model, containing all the information (simulated and input) about a set of people or groups. It takes here a few arguments: a `Dataset`, `Model` and `ModelVersion`.\n", - "* The `Dataset` object, which represents a set of people or groups. Here we used a utility function to create this dataset for the UK, but we later will be able to create these from scratch or pull them from a database.\n", - "* The `Model` object, which represents a particular microsimulation model (essentially defined as a function transforming a dataset to a new dataset). There are two models defined by this package, one for the UK and one for the US. Think of these objects as adapters representing the full microsimulation models. Here, we've taken the pre-defined UK model.\n", - "* The `ModelVersion` object, which represents a particular version of a model. This is useful for tracking changes to the model over time. Here, we used the latest version of the UK model.\n", - "\n", - "## Adding a policy reform\n", - "\n", - "Next, we'll add in a policy reform, and see how that changes the results." 
- ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "b40913b2", - "metadata": {}, - "outputs": [ - { - "data": { - "application/vnd.plotly.v1+json": { - "config": { - "plotlyServerURL": "https://plot.ly" - }, - "data": [ - { - "marker": { - "color": "#319795" - }, - "name": "Baseline", - "textfont": { - "color": "#6B7280", - "family": "Roboto Mono, monospace", - "size": 11 - }, - "textposition": "outside", - "texttemplate": "%{y:,.0f}", - "type": "bar", - "x": [ - "£0", - "£20,000", - "£40,000", - "£60,000", - "£80,000", - "£100,000", - "£150,000", - "£200,000", - "£300,000", - "£500,000" - ], - "y": [ - 6628102.860910795, - 10308039.540624166, - 7153251.306053954, - 4288185.176098487, - 1690702.647548969, - 1320125.7573599513, - 326073.73102501093, - 187608.23132836912, - 63106.63353048405, - 41838.373842805624 - ] - }, - { - "marker": { - "color": "#0EA5E9" - }, - "name": "Reform", - "textfont": { - "color": "#6B7280", - "family": "Roboto Mono, monospace", - "size": 11 - }, - "textposition": "outside", - "texttemplate": "%{y:,.0f}", - "type": "bar", - "x": [ - "£0", - "£20,000", - "£40,000", - "£60,000", - "£80,000", - "£100,000", - "£150,000", - "£200,000", - "£300,000", - "£500,000" - ], - "y": [ - 6172777.805479924, - 10310058.00384126, - 6911190.799784593, - 4471614.799692215, - 2005466.130918176, - 1471720.3202646417, - 341808.24952948757, - 218180.35939976107, - 63106.63353048405, - 41838.373842805624 - ] - } - ], - "layout": { - "font": { - "color": "#101828", - "family": "Roboto, sans-serif", - "size": 14 - }, - "hoverlabel": { - "bgcolor": "white", - "bordercolor": "#81E6D9", - "font": { - "family": "Roboto Mono, monospace", - "size": 12 - } - }, - "hovermode": "x unified", - "paper_bgcolor": "white", - "plot_bgcolor": "white", - "showlegend": true, - "template": { - "data": { - "bar": [ - { - "error_x": { - "color": "#2a3f5f" - }, - "error_y": { - "color": "#2a3f5f" - }, - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, 
- "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "bar" - } - ], - "barpolar": [ - { - "marker": { - "line": { - "color": "#E5ECF6", - "width": 0.5 - }, - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "barpolar" - } - ], - "carpet": [ - { - "aaxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "baxis": { - "endlinecolor": "#2a3f5f", - "gridcolor": "white", - "linecolor": "white", - "minorgridcolor": "white", - "startlinecolor": "#2a3f5f" - }, - "type": "carpet" - } - ], - "choropleth": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "choropleth" - } - ], - "contour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "contour" - } - ], - "contourcarpet": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "contourcarpet" - } - ], - "heatmap": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmap" - } - ], - "heatmapgl": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": 
"" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "heatmapgl" - } - ], - "histogram": [ - { - "marker": { - "pattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - } - }, - "type": "histogram" - } - ], - "histogram2d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2d" - } - ], - "histogram2dcontour": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "histogram2dcontour" - } - ], - "mesh3d": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "type": "mesh3d" - } - ], - "parcoords": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "parcoords" - } - ], - "pie": [ - { - "automargin": true, - "type": "pie" - } - ], - "scatter": [ - { - 
"fillpattern": { - "fillmode": "overlay", - "size": 10, - "solidity": 0.2 - }, - "type": "scatter" - } - ], - "scatter3d": [ - { - "line": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatter3d" - } - ], - "scattercarpet": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattercarpet" - } - ], - "scattergeo": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergeo" - } - ], - "scattergl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattergl" - } - ], - "scattermapbox": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scattermapbox" - } - ], - "scatterpolar": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolar" - } - ], - "scatterpolargl": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterpolargl" - } - ], - "scatterternary": [ - { - "marker": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "type": "scatterternary" - } - ], - "surface": [ - { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - }, - "colorscale": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "type": "surface" - } - ], - "table": [ - { - "cells": { - "fill": { - "color": "#EBF0F8" - }, - "line": { - "color": "white" - } - }, - "header": { - "fill": { - "color": "#C8D4E3" - }, - "line": { - "color": "white" - } - }, - "type": "table" - } - ] - }, - "layout": { 
- "annotationdefaults": { - "arrowcolor": "#2a3f5f", - "arrowhead": 0, - "arrowwidth": 1 - }, - "autotypenumbers": "strict", - "coloraxis": { - "colorbar": { - "outlinewidth": 0, - "ticks": "" - } - }, - "colorscale": { - "diverging": [ - [ - 0, - "#8e0152" - ], - [ - 0.1, - "#c51b7d" - ], - [ - 0.2, - "#de77ae" - ], - [ - 0.3, - "#f1b6da" - ], - [ - 0.4, - "#fde0ef" - ], - [ - 0.5, - "#f7f7f7" - ], - [ - 0.6, - "#e6f5d0" - ], - [ - 0.7, - "#b8e186" - ], - [ - 0.8, - "#7fbc41" - ], - [ - 0.9, - "#4d9221" - ], - [ - 1, - "#276419" - ] - ], - "sequential": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ], - "sequentialminus": [ - [ - 0, - "#0d0887" - ], - [ - 0.1111111111111111, - "#46039f" - ], - [ - 0.2222222222222222, - "#7201a8" - ], - [ - 0.3333333333333333, - "#9c179e" - ], - [ - 0.4444444444444444, - "#bd3786" - ], - [ - 0.5555555555555556, - "#d8576b" - ], - [ - 0.6666666666666666, - "#ed7953" - ], - [ - 0.7777777777777778, - "#fb9f3a" - ], - [ - 0.8888888888888888, - "#fdca26" - ], - [ - 1, - "#f0f921" - ] - ] - }, - "colorway": [ - "#636efa", - "#EF553B", - "#00cc96", - "#ab63fa", - "#FFA15A", - "#19d3f3", - "#FF6692", - "#B6E880", - "#FF97FF", - "#FECB52" - ], - "font": { - "color": "#2a3f5f" - }, - "geo": { - "bgcolor": "white", - "lakecolor": "white", - "landcolor": "#E5ECF6", - "showlakes": true, - "showland": true, - "subunitcolor": "white" - }, - "hoverlabel": { - "align": "left" - }, - "hovermode": "closest", - "mapbox": { - "style": "light" - }, - "paper_bgcolor": "white", - "plot_bgcolor": "#E5ECF6", - "polar": { - "angularaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - 
"bgcolor": "#E5ECF6", - "radialaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "scene": { - "xaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "yaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - }, - "zaxis": { - "backgroundcolor": "#E5ECF6", - "gridcolor": "white", - "gridwidth": 2, - "linecolor": "white", - "showbackground": true, - "ticks": "", - "zerolinecolor": "white" - } - }, - "shapedefaults": { - "line": { - "color": "#2a3f5f" - } - }, - "ternary": { - "aaxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "baxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - }, - "bgcolor": "#E5ECF6", - "caxis": { - "gridcolor": "white", - "linecolor": "white", - "ticks": "" - } - }, - "title": { - "x": 0.05 - }, - "xaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - }, - "yaxis": { - "automargin": true, - "gridcolor": "white", - "linecolor": "white", - "ticks": "", - "title": { - "standoff": 15 - }, - "zerolinecolor": "white", - "zerolinewidth": 2 - } - } - }, - "title": { - "font": { - "color": "#101828", - "family": "Roboto, sans-serif", - "size": 20, - "weight": 500 - }, - "text": "The distribution of household income in the UK" - }, - "xaxis": { - "gridcolor": "#E2E8F0", - "gridwidth": 1, - "linecolor": "#E2E8F0", - "linewidth": 1, - "mirror": false, - "showgrid": true, - "showline": true, - "tickfont": { - "color": "#6B7280", - "family": "Roboto Mono, monospace", - "size": 11 - }, - "title": { - "font": { - "color": "#6B7280" - }, - "text": "Income range" - }, - "zeroline": true, - "zerolinecolor": 
"#F2F4F7", - "zerolinewidth": 1 - }, - "yaxis": { - "gridcolor": "#E2E8F0", - "gridwidth": 1, - "linecolor": "#E2E8F0", - "linewidth": 1, - "mirror": false, - "showgrid": true, - "showline": true, - "tickfont": { - "color": "#6B7280", - "family": "Roboto Mono, monospace", - "size": 11 - }, - "title": { - "font": { - "color": "#6B7280" - }, - "text": "Number of households" - }, - "zeroline": true, - "zerolinecolor": "#F2F4F7", - "zerolinewidth": 1 - } - } - } - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "from datetime import datetime\n", - "\n", - "from policyengine.models import Parameter, ParameterValue, Policy\n", - "\n", - "# Parameter = the parameter to change\n", - "\n", - "personal_allowance = Parameter(\n", - " id=\"gov.hmrc.income_tax.allowances.personal_allowance.amount\",\n", - " model=policyengine_uk_model,\n", - ")\n", - "\n", - "# ParameterValue = the value to set the parameter to, and when to start\n", - "\n", - "personal_allowance_value = ParameterValue(\n", - " parameter=personal_allowance,\n", - " start_date=datetime(2029, 1, 1),\n", - " value=20000,\n", - ")\n", - "\n", - "# Create a policy to increase the personal allowance to £20,000 from 2029-30\n", - "\n", - "policy = Policy(\n", - " name=\"Increase personal allowance to £20,000\",\n", - " description=\"A policy to increase the personal allowance for income tax to £20,000.\",\n", - " parameter_values=[personal_allowance_value],\n", - ")\n", - "\n", - "sim_2 = Simulation(\n", - " dataset=uk_dataset,\n", - " model=policyengine_uk_model,\n", - " model_version=policyengine_uk_latest_version,\n", - " policy=policy, # Pass in the policy here\n", - ")\n", - "\n", - "sim_2.run()\n", - "\n", - "# Extract new aggregates for household income ranges\n", - "\n", - "income_ranges = [\n", - " 0,\n", - " 20000,\n", - " 40000,\n", - " 60000,\n", - " 80000,\n", - " 100000,\n", - " 150000,\n", - " 200000,\n", - " 300000,\n", - " 500000,\n", - " 1_000_000,\n", - "]\n", - 
"aggregates_2 = []\n", - "for i in range(len(income_ranges) - 1):\n", - " aggregates_2.append(\n", - " Aggregate(\n", - " entity=\"household\",\n", - " variable_name=\"hbai_household_net_income\",\n", - " aggregate_function=\"count\",\n", - " filter_variable_name=\"hbai_household_net_income\",\n", - " filter_variable_geq=income_ranges[i],\n", - " filter_variable_leq=income_ranges[i + 1],\n", - " simulation=sim_2,\n", - " )\n", - " )\n", - "\n", - "aggregates_2 = Aggregate.run(aggregates_2)\n", - "\n", - "# Create the comparative bar chart\n", - "fig = go.Figure(\n", - " data=[\n", - " go.Bar(\n", - " name=\"Baseline\",\n", - " x=[f\"£{inc:,}\" for inc in income_ranges[:-1]],\n", - " y=[agg.value for agg in aggregates],\n", - " ),\n", - " go.Bar(\n", - " name=\"Reform\",\n", - " x=[f\"£{inc:,}\" for inc in income_ranges[:-1]],\n", - " y=[agg.value for agg in aggregates_2],\n", - " ),\n", - " ]\n", - ")\n", - "\n", - "# Apply formatting\n", - "fig = format_figure(\n", - " fig,\n", - " title=\"The distribution of household income in the UK\",\n", - " x_title=\"Income range\",\n", - " y_title=\"Number of households\",\n", - ")\n", - "\n", - "add_fonts()\n", - "\n", - "fig" - ] - }, - { - "cell_type": "markdown", - "id": "6c029d3b", - "metadata": {}, - "source": [ - "In the above example, we created a `Policy` object, which represents a particular policy reform. This object contains a list of `ParameterValue` objects, which represent changes to specific parameters in the model. Here, we changed the personal allowance for income tax to £20,000.\n", - "\n", - "## Bringing in a database\n", - "\n", - "Now, we can upload these objects to a database, and then load them back out again. This is useful for tracking different simulations and policy reforms over time." 
- ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "f14c85eb", - "metadata": {}, - "outputs": [], - "source": [ - "from policyengine.database import Database\n", - "\n", - "database = Database(\"postgresql://postgres:postgres@127.0.0.1:54322/postgres\")\n", - "\n", - "# These two lines are not usually needed, but you should use them the first time you set up a new database\n", - "database.reset() # Drop and recreate all tables\n", - "database.register_model_version(\n", - " policyengine_uk_latest_version\n", - ") # Add in the model, model version, parameters and baseline parameter values and variables.\n", - "\n", - "database.set(uk_dataset)\n", - "database.set(policy)\n", - "\n", - "for pv in policy.parameter_values:\n", - " database.set(pv)\n", - "database.set(sim)\n", - "database.set(sim_2)\n", - "for agg in aggregates:\n", - " database.set(agg)\n", - "for agg in aggregates_2:\n", - " database.set(agg)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "2041dfeb", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "Policy(id='26f30afa-77b9-4435-812c-071873e25400', name='Increase personal allowance to £20,000', description='A policy to increase the personal allowance for income tax to £20,000.', parameter_values=[], simulation_modifier=None, created_at=datetime.datetime(2025, 9, 20, 12, 36, 27, 162725), updated_at=datetime.datetime(2025, 9, 20, 12, 36, 27, 162729))" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "database.get(Policy, id=policy.id)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "policyengine", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.13.5" - } - }, - "nbformat": 4, - 
"nbformat_minor": 5 -} diff --git a/docs/visualisation.md b/docs/visualisation.md new file mode 100644 index 00000000..639f12ae --- /dev/null +++ b/docs/visualisation.md @@ -0,0 +1,72 @@ +# Visualisation utilities + +PolicyEngine provides utilities for creating publication-ready charts that follow our visual style guidelines. + +## Formatting plotly figures + +The `format_fig()` function applies PolicyEngine's visual style to plotly figures, ensuring consistency across all analyses and publications. + +```python +from policyengine.utils import format_fig, COLORS +import plotly.graph_objects as go + +# Create your figure +fig = go.Figure() +fig.add_trace(go.Scatter(x=[1, 2, 3], y=[4, 5, 6], name="Data")) + +# Apply PolicyEngine styling +format_fig( + fig, + title="Example chart", + xaxis_title="X axis", + yaxis_title="Y axis", + height=600, + width=800 +) + +fig.show() +``` + +## Visual style principles + +The formatting applies these principles automatically: + +**Colours**: Primary teal (#319795) with semantic colours for different data types (success/green, warning/yellow, error/red, info/blue). Access colours via the `COLORS` dictionary: + +```python +from policyengine.utils import COLORS + +fig.add_trace(go.Scatter( + x=x_data, + y=y_data, + line=dict(color=COLORS["primary"]) +)) +``` + +**Typography**: Inter font family with appropriate sizing (12px for labels, 14px for body text, 16px for titles). + +**Layout**: Clean white background with subtle grey gridlines and appropriate margins (48px) for professional presentation. + +**Clarity**: Data-driven design that prioritises immediate understanding over decoration. 
+ +## Available colours + +```python +COLORS = { + "primary": "#319795", # Teal (main brand colour) + "primary_light": "#E6FFFA", + "primary_dark": "#1D4044", + "success": "#22C55E", # Green (positive changes) + "warning": "#FEC601", # Yellow (cautions) + "error": "#EF4444", # Red (negative changes) + "info": "#1890FF", # Blue (neutral information) + "gray_light": "#F2F4F7", + "gray": "#667085", + "gray_dark": "#101828", + "blue_secondary": "#026AA2", +} +``` + +## Complete example + +See `examples/employment_income_variation.py` for a full demonstration of using `format_fig()` in an analysis workflow. diff --git a/examples/employment_income_variation_uk.py b/examples/employment_income_variation_uk.py new file mode 100644 index 00000000..173c78ff --- /dev/null +++ b/examples/employment_income_variation_uk.py @@ -0,0 +1,395 @@ +"""Example: Vary employment income and plot HBAI household net income. + +This script demonstrates: +1. Creating a custom dataset with a single household template +2. Varying employment income from £0 to £100k +3. Running a single simulation for all variations +4. Using Aggregate with filters to extract results by employment income +5. 
Visualising the relationship between employment income and net income + +IMPORTANT NOTES FOR CUSTOM DATASETS: +- Always set would_claim_* variables to True, otherwise benefits won't be claimed + even if the household is eligible (they default to random/False) +- Always set disability variables explicitly (is_disabled_for_benefits, uc_limited_capability_for_WRA) + to prevent random UC spikes from LCWRA element (£5,241/year extra if randomly assigned) +- Must include join keys: person_benunit_id, person_household_id in person data +- Required household fields: region, council_tax, rent, tenure_type +- Person-level variables are mapped to household level using weights + +Run: python examples/employment_income_variation.py +""" + +import tempfile +from pathlib import Path + +import pandas as pd +import plotly.graph_objects as go +from microdf import MicroDataFrame + +from policyengine.core import Simulation +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, + UKYearData, + uk_latest, +) +from policyengine.utils.plotting import COLORS, format_fig + + +def create_dataset_with_varied_employment_income( + employment_incomes: list[float], year: int = 2026 +) -> PolicyEngineUKDataset: + """Create a dataset with one household template, varied by employment income. + + Each household is a single adult with 2 children, paying median UK rent. + Employment income varies across households. 
+ """ + n_households = len(employment_incomes) + n_households * 3 # 1 adult + 2 children per household + + # Create person data - one adult + 2 children per household + person_ids = [] + benunit_ids = [] + household_ids = [] + ages = [] + employment_income = [] + person_weights = [] + is_disabled = [] + limited_capability = [] + + person_id_counter = 0 + for hh_idx in range(n_households): + # Adult + person_ids.append(person_id_counter) + benunit_ids.append(hh_idx) + household_ids.append(hh_idx) + ages.append(35) + employment_income.append(employment_incomes[hh_idx]) + person_weights.append(1.0) + is_disabled.append(False) + limited_capability.append(False) + person_id_counter += 1 + + # Child 1 (age 8) + person_ids.append(person_id_counter) + benunit_ids.append(hh_idx) + household_ids.append(hh_idx) + ages.append(8) + employment_income.append(0.0) + person_weights.append(1.0) + is_disabled.append(False) + limited_capability.append(False) + person_id_counter += 1 + + # Child 2 (age 5) + person_ids.append(person_id_counter) + benunit_ids.append(hh_idx) + household_ids.append(hh_idx) + ages.append(5) + employment_income.append(0.0) + person_weights.append(1.0) + is_disabled.append(False) + limited_capability.append(False) + person_id_counter += 1 + + person_data = { + "person_id": person_ids, + "person_benunit_id": benunit_ids, + "person_household_id": household_ids, + "age": ages, + "employment_income": employment_income, + "person_weight": person_weights, + "is_disabled_for_benefits": is_disabled, + "uc_limited_capability_for_WRA": limited_capability, + } + + # Create benunit data - one per household + benunit_data = { + "benunit_id": list(range(n_households)), + "benunit_weight": [1.0] * n_households, + # Would claim variables - MUST set to True or benefits won't be claimed! 
+ "would_claim_uc": [True] * n_households, + "would_claim_WTC": [True] * n_households, + "would_claim_CTC": [True] * n_households, + "would_claim_IS": [True] * n_households, + "would_claim_pc": [True] * n_households, + "would_claim_child_benefit": [True] * n_households, + "would_claim_housing_benefit": [True] * n_households, + } + + # Create household data - one per employment income level + median_annual_rent = 850 * 12 # £850/month = £10,200/year (median UK rent) + household_data = { + "household_id": list(range(n_households)), + "household_weight": [1.0] * n_households, + "region": ["LONDON"] * n_households, # Required by policyengine-uk + "council_tax": [0.0] * n_households, # Simplified - no council tax + "rent": [median_annual_rent] * n_households, # Median UK rent + "tenure_type": ["RENT_PRIVATELY"] + * n_households, # Required for uprating + } + + # Create MicroDataFrames + person_df = MicroDataFrame( + pd.DataFrame(person_data), weights="person_weight" + ) + benunit_df = MicroDataFrame( + pd.DataFrame(benunit_data), weights="benunit_weight" + ) + household_df = MicroDataFrame( + pd.DataFrame(household_data), weights="household_weight" + ) + + # Create temporary file + tmpdir = tempfile.mkdtemp() + filepath = str(Path(tmpdir) / "employment_income_variation.h5") + + # Create dataset + dataset = PolicyEngineUKDataset( + name="Employment income variation", + description="Single adult household with varying employment income", + filepath=filepath, + year=year, + data=UKYearData( + person=person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + return dataset + + +def run_simulation(dataset: PolicyEngineUKDataset) -> Simulation: + """Run a single simulation for all employment income variations.""" + # Specify additional variables to calculate beyond defaults + variables = { + "household": [ + # Default variables + "household_id", + "household_weight", + "household_net_income", + "hbai_household_net_income", + "household_benefits", + 
"household_tax", + ], + "person": [ + "person_id", + "benunit_id", + "household_id", + "person_weight", + "employment_income", + "age", + ], + "benunit": [ + "benunit_id", + "benunit_weight", + # Individual benefits (at benunit level) + "universal_credit", + "child_benefit", + "working_tax_credit", + "child_tax_credit", + "pension_credit", + "income_support", + ], + } + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + variables=variables, + ) + simulation.run() + return simulation + + +def extract_results_by_employment_income( + simulation: Simulation, employment_incomes: list[float] +) -> dict: + """Extract HBAI household net income and components for each employment income level. + + Uses Aggregate with filters to extract specific households. + """ + hbai_net_income = [] + household_benefits = [] + household_tax = [] + employment_income_hh = [] + + # Individual benefits + universal_credit = [] + child_benefit = [] + working_tax_credit = [] + child_tax_credit = [] + pension_credit = [] + income_support = [] + + for hh_idx, emp_income in enumerate(employment_incomes): + # Get HBAI household net income + agg = Aggregate( + simulation=simulation, + variable="hbai_household_net_income", + aggregate_type=AggregateType.MEAN, + filter_variable="household_id", + filter_variable_eq=hh_idx, + entity="household", + ) + agg.run() + hbai_net_income.append(agg.result) + + # Get household benefits + agg = Aggregate( + simulation=simulation, + variable="household_benefits", + aggregate_type=AggregateType.MEAN, + filter_variable="household_id", + filter_variable_eq=hh_idx, + entity="household", + ) + agg.run() + household_benefits.append(agg.result) + + # Get individual benefits (at benunit level, but we have 1:1 benunit:household mapping) + for benefit_name, benefit_list in [ + ("universal_credit", universal_credit), + ("child_benefit", child_benefit), + ("working_tax_credit", working_tax_credit), + ("child_tax_credit", child_tax_credit), + 
("pension_credit", pension_credit), + ("income_support", income_support), + ]: + agg = Aggregate( + simulation=simulation, + variable=benefit_name, + aggregate_type=AggregateType.MEAN, + filter_variable="benunit_id", + filter_variable_eq=hh_idx, + entity="benunit", + ) + agg.run() + benefit_list.append(agg.result) + + # Get household tax + agg = Aggregate( + simulation=simulation, + variable="household_tax", + aggregate_type=AggregateType.MEAN, + filter_variable="household_id", + filter_variable_eq=hh_idx, + entity="household", + ) + agg.run() + household_tax.append(agg.result) + + # Employment income at household level (just use the input value) + employment_income_hh.append(emp_income) + + return { + "employment_income": employment_incomes, + "hbai_household_net_income": hbai_net_income, + "household_benefits": household_benefits, + "household_tax": household_tax, + "employment_income_hh": employment_income_hh, + "universal_credit": universal_credit, + "child_benefit": child_benefit, + "working_tax_credit": working_tax_credit, + "child_tax_credit": child_tax_credit, + "pension_credit": pension_credit, + "income_support": income_support, + } + + +def visualise_results(results: dict) -> None: + """Create a stacked area chart showing income composition.""" + fig = go.Figure() + + # Calculate net employment income (employment income minus tax) + net_employment = [ + emp - tax + for emp, tax in zip( + results["employment_income_hh"], results["household_tax"] + ) + ] + + # Stack benefits and income components using PolicyEngine colors + components = [ + ("Net employment income", net_employment, COLORS["primary"]), + ( + "Universal Credit", + results["universal_credit"], + COLORS["blue_secondary"], + ), + ("Working Tax Credit", results["working_tax_credit"], COLORS["info"]), + ("Child Tax Credit", results["child_tax_credit"], COLORS["success"]), + ("Child Benefit", results["child_benefit"], COLORS["warning"]), + ("Pension Credit", results["pension_credit"], 
COLORS["gray"]), + ("Income Support", results["income_support"], COLORS["gray_dark"]), + ] + + for name, values, color in components: + fig.add_trace( + go.Scatter( + x=results["employment_income"], + y=values, + name=name, + mode="lines", + line=dict(width=0.5, color=color), + stackgroup="one", + fillcolor=color, + ) + ) + + # Apply PolicyEngine styling + format_fig( + fig, + title="Household net income composition by employment income", + xaxis_title="Employment income (£)", + yaxis_title="Net income (£)", + show_legend=True, + height=700, + width=1200, + ) + + fig.show() + + +def main(): + """Main execution function.""" + # Create employment income range from £0 to £100k + # Using smaller intervals at lower incomes where the relationship is more interesting + employment_incomes = ( + list(range(0, 20000, 1000)) # £0 to £20k in £1k steps + + list(range(20000, 50000, 2500)) # £20k to £50k in £2.5k steps + + list(range(50000, 100001, 5000)) # £50k to £100k in £5k steps + ) + + print( + f"Creating dataset with {len(employment_incomes)} employment income variations..." 
+ ) + dataset = create_dataset_with_varied_employment_income(employment_incomes) + + print("Running simulation (single run for all variations)...") + simulation = run_simulation(dataset) + + print("Extracting results using aggregate filters...") + results = extract_results_by_employment_income( + simulation, employment_incomes + ) + + print("\nSample results:") + for emp_inc in [0, 25000, 50000, 100000]: + idx = ( + employment_incomes.index(emp_inc) + if emp_inc in employment_incomes + else -1 + ) + if idx >= 0: + print( + f" Employment income £{emp_inc:,}: HBAI net income £{results['hbai_household_net_income'][idx]:,.0f}" + ) + + print("\nGenerating visualisation...") + visualise_results(results) + + +if __name__ == "__main__": + main() diff --git a/examples/employment_income_variation_us.py b/examples/employment_income_variation_us.py new file mode 100644 index 00000000..863d8018 --- /dev/null +++ b/examples/employment_income_variation_us.py @@ -0,0 +1,357 @@ +"""Example: Vary employment income and plot household net income (US). + +This script demonstrates: +1. Creating a custom dataset with a single household template +2. Varying employment income from $0 to $200k +3. Running a single simulation for all variations +4. Using Aggregate with filters to extract results by employment income +5. 
Visualising the relationship between employment income and net income + +Run: python examples/employment_income_variation_us.py +""" + +import tempfile +from pathlib import Path + +import pandas as pd +import plotly.graph_objects as go +from microdf import MicroDataFrame + +from policyengine.core import Simulation +from policyengine.tax_benefit_models.us import ( + PolicyEngineUSDataset, + USYearData, + us_latest, +) +from policyengine.utils.plotting import COLORS, format_fig + + +def create_dataset_with_varied_employment_income( + employment_incomes: list[float], year: int = 2024 +) -> PolicyEngineUSDataset: + """Create a dataset with one household template, varied by employment income. + + Each household is a single adult with 2 children. + Employment income varies across households. + """ + n_households = len(employment_incomes) + n_households * 3 # 1 adult + 2 children per household + + # Create person data - one adult + 2 children per household + person_ids = [] + household_ids = [] + marital_unit_ids = [] + family_ids = [] + spm_unit_ids = [] + tax_unit_ids = [] + ages = [] + employment_income = [] + person_weights = [] + + person_id_counter = 0 + for hh_idx in range(n_households): + # Adult + person_ids.append(person_id_counter) + household_ids.append(hh_idx) + marital_unit_ids.append(hh_idx) + family_ids.append(hh_idx) + spm_unit_ids.append(hh_idx) + tax_unit_ids.append(hh_idx) + ages.append(35) + employment_income.append(employment_incomes[hh_idx]) + person_weights.append(1000.0) + person_id_counter += 1 + + # Child 1 (age 8) + person_ids.append(person_id_counter) + household_ids.append(hh_idx) + marital_unit_ids.append(hh_idx) + family_ids.append(hh_idx) + spm_unit_ids.append(hh_idx) + tax_unit_ids.append(hh_idx) + ages.append(8) + employment_income.append(0.0) + person_weights.append(1000.0) + person_id_counter += 1 + + # Child 2 (age 5) + person_ids.append(person_id_counter) + household_ids.append(hh_idx) + marital_unit_ids.append(hh_idx) + 
family_ids.append(hh_idx) + spm_unit_ids.append(hh_idx) + tax_unit_ids.append(hh_idx) + ages.append(5) + employment_income.append(0.0) + person_weights.append(1000.0) + person_id_counter += 1 + + person_data = { + "person_id": person_ids, + "household_id": household_ids, + "marital_unit_id": marital_unit_ids, + "family_id": family_ids, + "spm_unit_id": spm_unit_ids, + "tax_unit_id": tax_unit_ids, + "age": ages, + "employment_income": employment_income, + "person_weight": person_weights, + } + + # Create household data + household_data = { + "household_id": list(range(n_households)), + "state_name": ["CA"] * n_households, # California + "household_weight": [1000.0] * n_households, + } + + # Create group entity data + marital_unit_data = { + "marital_unit_id": list(range(n_households)), + "marital_unit_weight": [1000.0] * n_households, + } + + family_data = { + "family_id": list(range(n_households)), + "family_weight": [1000.0] * n_households, + } + + spm_unit_data = { + "spm_unit_id": list(range(n_households)), + "spm_unit_weight": [1000.0] * n_households, + } + + tax_unit_data = { + "tax_unit_id": list(range(n_households)), + "tax_unit_weight": [1000.0] * n_households, + } + + # Create MicroDataFrames + person_df = MicroDataFrame( + pd.DataFrame(person_data), weights="person_weight" + ) + household_df = MicroDataFrame( + pd.DataFrame(household_data), weights="household_weight" + ) + marital_unit_df = MicroDataFrame( + pd.DataFrame(marital_unit_data), weights="marital_unit_weight" + ) + family_df = MicroDataFrame( + pd.DataFrame(family_data), weights="family_weight" + ) + spm_unit_df = MicroDataFrame( + pd.DataFrame(spm_unit_data), weights="spm_unit_weight" + ) + tax_unit_df = MicroDataFrame( + pd.DataFrame(tax_unit_data), weights="tax_unit_weight" + ) + + # Create temporary file + tmpdir = tempfile.mkdtemp() + filepath = str(Path(tmpdir) / "employment_income_variation_us.h5") + + # Create dataset + dataset = PolicyEngineUSDataset( + name="Employment income 
variation (US)", + description="Single adult household with 2 children, varying employment income", + filepath=filepath, + year=year, + data=USYearData( + person=person_df, + household=household_df, + marital_unit=marital_unit_df, + family=family_df, + spm_unit=spm_unit_df, + tax_unit=tax_unit_df, + ), + ) + + return dataset + + +def run_simulation(dataset: PolicyEngineUSDataset) -> Simulation: + """Run a single simulation for all employment income variations.""" + # Specify variables to calculate + variables = { + "household": [ + "household_id", + "household_weight", + "household_net_income", + "household_benefits", + "household_tax", + "household_market_income", + ], + "person": [ + "person_id", + "household_id", + "marital_unit_id", + "family_id", + "spm_unit_id", + "tax_unit_id", + "person_weight", + "employment_income", + "age", + ], + "spm_unit": [ + "spm_unit_id", + "spm_unit_weight", + "snap", + "tanf", + "spm_unit_net_income", + ], + "tax_unit": [ + "tax_unit_id", + "tax_unit_weight", + "income_tax", + "employee_payroll_tax", + "eitc", + "ctc", + ], + "marital_unit": [ + "marital_unit_id", + "marital_unit_weight", + ], + "family": [ + "family_id", + "family_weight", + ], + } + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=us_latest, + variables=variables, + ) + simulation.run() + return simulation + + +def extract_results_by_employment_income( + simulation: Simulation, employment_incomes: list[float] +) -> dict: + """Extract household net income and components for each employment income level. + + Directly accesses output data by row index since we have one household per income level. 
+ """ + import pandas as pd + + # Get output data + household_df = pd.DataFrame(simulation.output_dataset.data.household) + spm_unit_df = pd.DataFrame(simulation.output_dataset.data.spm_unit) + tax_unit_df = pd.DataFrame(simulation.output_dataset.data.tax_unit) + + # Extract results (one row per household/spm_unit/tax_unit) + household_net_income = household_df["household_net_income"].tolist() + household_benefits = household_df["household_benefits"].tolist() + household_tax = household_df["household_tax"].tolist() + + snap = spm_unit_df["snap"].tolist() + tanf = spm_unit_df["tanf"].tolist() + + eitc = tax_unit_df["eitc"].tolist() + ctc = tax_unit_df["ctc"].tolist() + + employment_income_hh = employment_incomes + + return { + "employment_income": employment_incomes, + "household_net_income": household_net_income, + "household_benefits": household_benefits, + "household_tax": household_tax, + "employment_income_hh": employment_income_hh, + "snap": snap, + "tanf": tanf, + "eitc": eitc, + "ctc": ctc, + } + + +def visualise_results(results: dict) -> None: + """Create a stacked area chart showing income composition.""" + fig = go.Figure() + + # Calculate net employment income (employment income minus tax) + net_employment = [ + emp - tax + for emp, tax in zip( + results["employment_income_hh"], results["household_tax"] + ) + ] + + # Stack benefits and income components using PolicyEngine colors + components = [ + ("Net employment income", net_employment, COLORS["primary"]), + ("SNAP", results["snap"], COLORS["blue_secondary"]), + ("TANF", results["tanf"], COLORS["info"]), + ("EITC", results["eitc"], COLORS["success"]), + ("CTC", results["ctc"], COLORS["warning"]), + ] + + for name, values, color in components: + fig.add_trace( + go.Scatter( + x=results["employment_income"], + y=values, + name=name, + mode="lines", + line=dict(width=0.5, color=color), + stackgroup="one", + fillcolor=color, + ) + ) + + # Apply PolicyEngine styling + format_fig( + fig, + title="Household 
net income composition by employment income", + xaxis_title="Employment income ($)", + yaxis_title="Net income ($)", + show_legend=True, + height=700, + width=1200, + ) + + fig.show() + + +def main(): + """Main execution function.""" + # Create employment income range from $0 to $200k + # Using smaller intervals at lower incomes where the relationship is more interesting + employment_incomes = ( + list(range(0, 40000, 2000)) # $0 to $40k in $2k steps + + list(range(40000, 100000, 5000)) # $40k to $100k in $5k steps + + list(range(100000, 200001, 10000)) # $100k to $200k in $10k steps + ) + + print( + f"Creating dataset with {len(employment_incomes)} employment income variations..." + ) + dataset = create_dataset_with_varied_employment_income(employment_incomes) + + print("Running simulation (single run for all variations)...") + simulation = run_simulation(dataset) + + print("Extracting results using aggregate filters...") + results = extract_results_by_employment_income( + simulation, employment_incomes + ) + + print("\nSample results:") + for emp_inc in [0, 50000, 100000, 200000]: + idx = ( + employment_incomes.index(emp_inc) + if emp_inc in employment_incomes + else -1 + ) + if idx >= 0: + print( + f" Employment income ${emp_inc:,}: household net income ${results['household_net_income'][idx]:,.0f}" + ) + + print("\nGenerating visualisation...") + visualise_results(results) + + +if __name__ == "__main__": + main() diff --git a/examples/income_bands_uk.py b/examples/income_bands_uk.py new file mode 100644 index 00000000..f4f43c72 --- /dev/null +++ b/examples/income_bands_uk.py @@ -0,0 +1,190 @@ +"""Example: Calculate net income and tax by income decile using representative microdata. + +This script demonstrates: +1. Using representative household microdata +2. Running a full microsimulation to calculate income tax and net income +3. Using Aggregate to calculate statistics within income deciles using quantile filters +4. 
Visualising results with Plotly + +Run: python examples/income_bands.py +""" + +from pathlib import Path + +import plotly.graph_objects as go +from plotly.subplots import make_subplots + +from policyengine.core import Simulation +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, + uk_latest, +) + + +def load_representative_data(year: int = 2026) -> PolicyEngineUKDataset: + """Load representative household microdata for a given year.""" + dataset_path = Path(f"./data/enhanced_frs_2023_24_year_{year}.h5") + + if not dataset_path.exists(): + raise FileNotFoundError( + f"Dataset not found at {dataset_path}. " + "Run create_datasets() from policyengine.tax_benefit_models.uk first." + ) + + dataset = PolicyEngineUKDataset( + name=f"Enhanced FRS {year}", + description=f"Representative household microdata for {year}", + filepath=str(dataset_path), + year=year, + ) + dataset.load() + return dataset + + +def run_simulation(dataset: PolicyEngineUKDataset) -> Simulation: + """Run a microsimulation on the dataset.""" + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + ) + simulation.run() + return simulation + + +def calculate_income_decile_statistics(simulation: Simulation) -> dict: + """Calculate total income, tax, and population by income deciles.""" + deciles = [] + net_incomes = [] + taxes = [] + counts = [] + + for decile in range(1, 11): + net_income_agg = Aggregate( + simulation=simulation, + variable="household_net_income", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile, + ) + net_income_agg.run() + + tax_agg = Aggregate( + simulation=simulation, + variable="household_tax", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile, + ) + tax_agg.run() + + count_agg = Aggregate( + simulation=simulation, + 
variable="household_net_income", + aggregate_type=AggregateType.COUNT, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile, + ) + count_agg.run() + + deciles.append(f"Decile {decile}") + net_incomes.append(net_income_agg.result / 1e9) # Convert to billions + taxes.append(tax_agg.result / 1e9) + counts.append(count_agg.result / 1e6) # Convert to millions + + return { + "deciles": deciles, + "net_incomes": net_incomes, + "taxes": taxes, + "counts": counts, + } + + +def visualise_results(results: dict) -> None: + """Create visualisations of income decile statistics.""" + fig = make_subplots( + rows=1, + cols=3, + subplot_titles=( + "Net income by decile (£bn)", + "Tax by decile (£bn)", + "Households by decile (millions)", + ), + specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}]], + ) + + fig.add_trace( + go.Bar( + x=results["deciles"], + y=results["net_incomes"], + marker_color="lightblue", + ), + row=1, + col=1, + ) + + fig.add_trace( + go.Bar( + x=results["deciles"], + y=results["taxes"], + marker_color="lightcoral", + ), + row=1, + col=2, + ) + + fig.add_trace( + go.Bar( + x=results["deciles"], + y=results["counts"], + marker_color="lightgreen", + ), + row=1, + col=3, + ) + + fig.update_xaxes(title_text="Income decile", row=1, col=1) + fig.update_xaxes(title_text="Income decile", row=1, col=2) + fig.update_xaxes(title_text="Income decile", row=1, col=3) + + fig.update_layout( + title_text="Household income and tax distribution", + showlegend=False, + height=400, + ) + + fig.show() + + +def main(): + """Main execution function.""" + print("Loading representative household data...") + dataset = load_representative_data(year=2026) + + print("Running microsimulation...") + simulation = run_simulation(dataset) + + print("Calculating statistics by income decile...") + results = calculate_income_decile_statistics(simulation) + + print("\nResults summary:") + total_net_income = sum(results["net_incomes"]) + total_tax = 
sum(results["taxes"]) + total_households = sum(results["counts"]) + + print(f"Total net income: £{total_net_income:.1f}bn") + print(f"Total tax: £{total_tax:.1f}bn") + print(f"Total households: {total_households:.1f}m") + print( + f"Average effective tax rate: {total_tax / (total_net_income + total_tax) * 100:.1f}%" + ) + + print("\nGenerating visualisations...") + visualise_results(results) + + +if __name__ == "__main__": + main() diff --git a/examples/income_distribution_us.py b/examples/income_distribution_us.py new file mode 100644 index 00000000..67417e13 --- /dev/null +++ b/examples/income_distribution_us.py @@ -0,0 +1,424 @@ +"""Example: Plot US household income distribution using enhanced CPS microdata. + +This script demonstrates: +1. Loading enhanced CPS representative household microdata +2. Running a full microsimulation to calculate household income and tax +3. Using Aggregate to calculate statistics within income deciles +4. Visualising the income distribution across the United States + +Run: python examples/income_distribution_us.py +""" + +import time +from pathlib import Path + +import plotly.graph_objects as go +from plotly.subplots import make_subplots + +from policyengine.core import Simulation +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.tax_benefit_models.us import ( + PolicyEngineUSDataset, + us_latest, +) +from policyengine.utils.plotting import COLORS, format_fig + + +def load_representative_data(year: int = 2024) -> PolicyEngineUSDataset: + """Load representative household microdata for a given year.""" + dataset_path = ( + Path(__file__).parent / "data" / f"enhanced_cps_2024_year_{year}.h5" + ) + + if not dataset_path.exists(): + raise FileNotFoundError( + f"Dataset not found at {dataset_path}. " + "Run create_datasets() from policyengine.tax_benefit_models.us first." 
+ ) + + dataset = PolicyEngineUSDataset( + name=f"Enhanced CPS {year}", + description=f"Representative household microdata for {year}", + filepath=str(dataset_path), + year=year, + ) + dataset.load() + return dataset + + +def run_simulation(dataset: PolicyEngineUSDataset) -> Simulation: + """Run a microsimulation on the dataset.""" + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=us_latest, + ) + simulation.run() + return simulation + + +def calculate_income_decile_statistics(simulation: Simulation) -> dict: + """Calculate total income, tax, and benefits by income deciles.""" + start_time = time.time() + deciles = [f"D{i}" for i in range(1, 11)] + market_incomes = [] + taxes = [] + benefits = [] + net_incomes = [] + counts = [] + + # Calculate household-level aggregates by decile + print("Calculating main statistics by decile...") + main_stats_start = time.time() + for decile_num in range(1, 11): + decile_start = time.time() + + # Market income + pre_create = time.time() + agg = Aggregate( + simulation=simulation, + variable="household_market_income", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile_num, + ) + if decile_num == 1: + print( + f" First Aggregate created ({time.time() - pre_create:.2f}s)" + ) + pre_run = time.time() + agg.run() + if decile_num == 1: + print( + f" First Aggregate.run() complete ({time.time() - pre_run:.2f}s)" + ) + market_incomes.append(agg.result / 1e9) + + agg = Aggregate( + simulation=simulation, + variable="household_tax", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile_num, + ) + agg.run() + taxes.append(agg.result / 1e9) + + agg = Aggregate( + simulation=simulation, + variable="household_benefits", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile_num, + ) + agg.run() + benefits.append(agg.result / 1e9) + + agg = 
Aggregate( + simulation=simulation, + variable="household_net_income", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile_num, + ) + agg.run() + net_incomes.append(agg.result / 1e9) + + agg = Aggregate( + simulation=simulation, + variable="household_weight", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile_num, + ) + agg.run() + counts.append(agg.result / 1e6) + + print(f" D{decile_num} complete ({time.time() - decile_start:.2f}s)") + + print(f"Main statistics complete ({time.time() - main_stats_start:.2f}s)") + + # Calculate individual benefit programs by decile + benefit_programs_by_decile = {} + + # Person-level benefits (mapped to household for decile filtering) + print("Calculating person-level benefit programs...") + person_benefits_start = time.time() + first_prog = True + for prog in [ + "ssi", + "social_security", + "medicaid", + "unemployment_compensation", + ]: + prog_start = time.time() + prog_by_decile = [] + for decile_num in range(1, 11): + if first_prog and decile_num == 1: + pre_create = time.time() + agg = Aggregate( + simulation=simulation, + variable=prog, + entity="household", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile_num, + debug_timing=first_prog and decile_num == 1, + ) + if first_prog and decile_num == 1: + print( + f" First benefit Aggregate created ({time.time() - pre_create:.2f}s)" + ) + pre_run = time.time() + agg.run() + if first_prog and decile_num == 1: + print( + f" First benefit Aggregate.run() complete ({time.time() - pre_run:.2f}s)" + ) + first_prog = False + prog_by_decile.append(agg.result / 1e9) + benefit_programs_by_decile[prog] = prog_by_decile + print(f" {prog} complete ({time.time() - prog_start:.2f}s)") + + print( + f"Person-level benefits complete ({time.time() - person_benefits_start:.2f}s)" + ) + + # SPM unit benefits 
(mapped to household for decile filtering) + print("Calculating SPM unit benefit programs...") + spm_benefits_start = time.time() + for prog in ["snap", "tanf"]: + prog_start = time.time() + prog_by_decile = [] + for decile_num in range(1, 11): + agg = Aggregate( + simulation=simulation, + variable=prog, + entity="household", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile_num, + ) + agg.run() + prog_by_decile.append(agg.result / 1e9) + benefit_programs_by_decile[prog] = prog_by_decile + print(f" {prog} complete ({time.time() - prog_start:.2f}s)") + + print(f"SPM benefits complete ({time.time() - spm_benefits_start:.2f}s)") + + # Tax unit benefits (mapped to household for decile filtering) + print("Calculating tax unit benefit programs...") + tax_benefits_start = time.time() + for prog in ["eitc", "ctc"]: + prog_start = time.time() + prog_by_decile = [] + for decile_num in range(1, 11): + agg = Aggregate( + simulation=simulation, + variable=prog, + entity="household", + aggregate_type=AggregateType.SUM, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile_num, + ) + agg.run() + prog_by_decile.append(agg.result / 1e9) + benefit_programs_by_decile[prog] = prog_by_decile + print(f" {prog} complete ({time.time() - prog_start:.2f}s)") + + print(f"Tax benefits complete ({time.time() - tax_benefits_start:.2f}s)") + print( + f"\nTotal statistics calculation time: {time.time() - start_time:.2f}s" + ) + + return { + "deciles": deciles, + "market_incomes": market_incomes, + "taxes": taxes, + "benefits": benefits, + "net_incomes": net_incomes, + "counts": counts, + "benefit_programs_by_decile": benefit_programs_by_decile, + } + + +def visualise_results(results: dict) -> None: + """Create visualisations of income distribution.""" + # Create overview figure + fig = make_subplots( + rows=2, + cols=2, + subplot_titles=( + "Market income by decile ($bn)", + "Tax by decile ($bn)", + "Benefits 
by program and decile ($bn)", + "Households by decile (millions)", + ), + specs=[ + [{"type": "bar"}, {"type": "bar"}], + [{"type": "bar"}, {"type": "bar"}], + ], + ) + + # Market income + fig.add_trace( + go.Bar( + x=results["deciles"], + y=results["market_incomes"], + marker_color=COLORS["primary"], + name="Market income", + showlegend=False, + ), + row=1, + col=1, + ) + + # Tax + fig.add_trace( + go.Bar( + x=results["deciles"], + y=results["taxes"], + marker_color=COLORS["error"], + name="Tax", + showlegend=False, + ), + row=1, + col=2, + ) + + # Benefits by program (stacked) - with legend + benefit_programs = [ + ("Social Security", "social_security", "#026AA2"), + ("Medicaid", "medicaid", "#319795"), + ("SNAP", "snap", "#22C55E"), + ("EITC", "eitc", "#FEC601"), + ("CTC", "ctc", "#1890FF"), + ("SSI", "ssi", "#EF4444"), + ("TANF", "tanf", "#667085"), + ("Unemployment", "unemployment_compensation", "#101828"), + ] + + for name, key, color in benefit_programs: + if key in results["benefit_programs_by_decile"]: + fig.add_trace( + go.Bar( + x=results["deciles"], + y=results["benefit_programs_by_decile"][key], + name=name, + marker_color=color, + legendgroup="benefits", + showlegend=True, + ), + row=2, + col=1, + ) + + # Household counts + fig.add_trace( + go.Bar( + x=results["deciles"], + y=results["counts"], + marker_color=COLORS["info"], + name="Households", + showlegend=False, + ), + row=2, + col=2, + ) + + fig.update_xaxes(title_text="Income decile", row=1, col=1) + fig.update_xaxes(title_text="Income decile", row=1, col=2) + fig.update_xaxes(title_text="Income decile", row=2, col=1) + fig.update_xaxes(title_text="Income decile", row=2, col=2) + + # Apply PolicyEngine formatting + format_fig( + fig, + title="US household income distribution (Enhanced CPS 2024)", + show_legend=True, + height=800, + width=1400, + ) + + # Override legend position for subplot layout + fig.update_layout( + barmode="stack", + legend=dict( + orientation="v", + yanchor="top", + y=0.45, 
+ xanchor="left", + x=0.52, + bgcolor="white", + bordercolor="#E5E7EB", + borderwidth=1, + ), + ) + + fig.show() + + +def main(): + """Main execution function.""" + print("Loading enhanced CPS representative household data...") + dataset = load_representative_data(year=2024) + + print( + f"Dataset loaded: {len(dataset.data.person):,} people, {len(dataset.data.household):,} households" + ) + + print("Running microsimulation...") + simulation = run_simulation(dataset) + + print("Calculating statistics by income decile...") + results = calculate_income_decile_statistics(simulation) + + print("\nResults summary:") + total_market_income = sum(results["market_incomes"]) + total_tax = sum(results["taxes"]) + total_benefits = sum(results["benefits"]) + total_net_income = sum(results["net_incomes"]) + total_households = sum(results["counts"]) + + print(f"Total market income: ${total_market_income:.1f}bn") + print(f"Total tax: ${total_tax:.1f}bn") + print(f"Total benefits: ${total_benefits:.1f}bn") + print(f"Total net income: ${total_net_income:.1f}bn") + print(f"Total households: {total_households:.1f}m") + print( + f"Average effective tax rate: {total_tax / total_market_income * 100:.1f}%" + ) + + print("\nBenefit programs by decile:") + benefit_programs = [ + ("Social Security", "social_security"), + ("Medicaid", "medicaid"), + ("SNAP", "snap"), + ("EITC", "eitc"), + ("CTC", "ctc"), + ("SSI", "ssi"), + ("TANF", "tanf"), + ("Unemployment", "unemployment_compensation"), + ] + + for name, key in benefit_programs: + if key in results["benefit_programs_by_decile"]: + total = sum(results["benefit_programs_by_decile"][key]) + print(f"\n {name} (total: ${total:.1f}bn):") + for i, decile in enumerate(results["deciles"]): + value = results["benefit_programs_by_decile"][key][i] + print(f" {decile}: ${value:.1f}bn") + + print("\nGenerating visualisations...") + visualise_results(results) + + +if __name__ == "__main__": + main() diff --git a/examples/policy_change_uk.py 
b/examples/policy_change_uk.py new file mode 100644 index 00000000..d708448b --- /dev/null +++ b/examples/policy_change_uk.py @@ -0,0 +1,314 @@ +"""Example: Analyse policy change impacts using ChangeAggregate with parametric reforms. + +This script demonstrates: +1. Loading representative household microdata +2. Applying parametric reforms (e.g., setting personal allowance to zero) +3. Running baseline and reform simulations +4. Using ChangeAggregate to analyse winners, losers, and impact sizes by income decile +5. Using quantile filters for decile-based analysis +6. Visualising results with Plotly + +Run: python examples/policy_change.py +""" + +import datetime +from pathlib import Path + +import plotly.graph_objects as go +from plotly.subplots import make_subplots + +from policyengine.core import Parameter, ParameterValue, Policy, Simulation +from policyengine.outputs.change_aggregate import ( + ChangeAggregate, + ChangeAggregateType, +) +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, + uk_latest, +) + + +def load_representative_data(year: int = 2026) -> PolicyEngineUKDataset: + """Load representative household microdata for a given year.""" + dataset_path = Path(f"./data/enhanced_frs_2023_24_year_{year}.h5") + + if not dataset_path.exists(): + raise FileNotFoundError( + f"Dataset not found at {dataset_path}. " + "Run create_datasets() from policyengine.tax_benefit_models.uk first." 
+ ) + + dataset = PolicyEngineUKDataset( + name=f"Enhanced FRS {year}", + description=f"Representative household microdata for {year}", + filepath=str(dataset_path), + year=year, + ) + dataset.load() + return dataset + + +def create_personal_allowance_reform(year: int) -> Policy: + """Create a policy that sets personal allowance to zero.""" + parameter = Parameter( + id=f"{uk_latest.id}-gov.hmrc.income_tax.allowances.personal_allowance.amount", + name="gov.hmrc.income_tax.allowances.personal_allowance.amount", + tax_benefit_model_version=uk_latest, + description="Personal allowance for income tax", + data_type=float, + ) + + parameter_value = ParameterValue( + parameter=parameter, + start_date=datetime.date(year, 1, 1), + end_date=datetime.date(year, 12, 31), + value=0, + ) + + return Policy( + name="Zero personal allowance", + description="Sets personal allowance to £0", + parameter_values=[parameter_value], + ) + + +def run_baseline_simulation(dataset: PolicyEngineUKDataset) -> Simulation: + """Run baseline microsimulation without policy changes.""" + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + ) + simulation.run() + return simulation + + +def run_reform_simulation( + dataset: PolicyEngineUKDataset, policy: Policy +) -> Simulation: + """Run reform microsimulation with policy changes.""" + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + policy=policy, + ) + simulation.run() + return simulation + + +def analyse_overall_impact( + baseline_sim: Simulation, reform_sim: Simulation +) -> dict: + """Analyse overall winners, losers, and financial impact.""" + winners = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_net_income", + aggregate_type=ChangeAggregateType.COUNT, + change_geq=1, + ) + winners.run() + + losers = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_net_income", 
+ aggregate_type=ChangeAggregateType.COUNT, + change_leq=-1, + ) + losers.run() + + no_change = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_net_income", + aggregate_type=ChangeAggregateType.COUNT, + change_eq=0, + ) + no_change.run() + + total_change = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_net_income", + aggregate_type=ChangeAggregateType.SUM, + ) + total_change.run() + + tax_revenue_change = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_tax", + aggregate_type=ChangeAggregateType.SUM, + ) + tax_revenue_change.run() + + return { + "winners": winners.result / 1e6, # Convert to millions + "losers": losers.result / 1e6, + "no_change": no_change.result / 1e6, + "total_change": total_change.result / 1e9, # Convert to billions + "tax_revenue_change": tax_revenue_change.result / 1e9, + } + + +def analyse_impact_by_income_decile( + baseline_sim: Simulation, reform_sim: Simulation +) -> dict: + """Analyse impact by income decile.""" + decile_labels = [] + decile_losers = [] + decile_avg_loss = [] + + for decile in range(1, 11): + label = f"Decile {decile}" + + # Count losers in this decile + count_agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_net_income", + aggregate_type=ChangeAggregateType.COUNT, + change_leq=-1, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile, + ) + count_agg.run() + + # Average loss for all households in this decile + mean_agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="household_net_income", + aggregate_type=ChangeAggregateType.MEAN, + filter_variable="household_net_income", + quantile=10, + quantile_eq=decile, + ) + mean_agg.run() + + decile_labels.append(label) + decile_losers.append(count_agg.result / 1e6) # 
Convert to millions + decile_avg_loss.append(mean_agg.result) + + return { + "labels": decile_labels, + "losers": decile_losers, + "avg_loss": decile_avg_loss, + } + + +def visualise_results( + overall: dict, by_decile: dict, reform_name: str +) -> None: + """Create visualisations of policy change impacts.""" + fig = make_subplots( + rows=1, + cols=3, + subplot_titles=( + "Winners vs losers (millions)", + "Losers by income decile (millions)", + "Average loss by income decile (£)", + ), + specs=[[{"type": "bar"}, {"type": "bar"}, {"type": "bar"}]], + ) + + fig.add_trace( + go.Bar( + x=["Winners", "No change", "Losers"], + y=[ + overall["winners"], + overall["no_change"], + overall["losers"], + ], + marker_color=["green", "gray", "red"], + ), + row=1, + col=1, + ) + + fig.add_trace( + go.Bar( + x=by_decile["labels"], + y=by_decile["losers"], + marker_color="lightcoral", + ), + row=1, + col=2, + ) + + fig.add_trace( + go.Bar( + x=by_decile["labels"], + y=by_decile["avg_loss"], + marker_color="orange", + ), + row=1, + col=3, + ) + + fig.update_xaxes(title_text="Category", row=1, col=1) + fig.update_xaxes(title_text="Income decile", row=1, col=2) + fig.update_xaxes(title_text="Income decile", row=1, col=3) + + fig.update_layout( + title_text=f"Policy change impact analysis: {reform_name}", + showlegend=False, + height=400, + ) + + fig.show() + + +def print_summary(overall: dict, decile: dict, reform_name: str) -> None: + """Print summary statistics.""" + print("=" * 60) + print(f"Policy change impact summary: {reform_name}") + print("=" * 60) + print("\nOverall impact:") + print(f" Winners: {overall['winners']:.2f}m households") + print(f" Losers: {overall['losers']:.2f}m households") + print(f" No change: {overall['no_change']:.2f}m households") + print("\nFinancial impact:") + print( + f" Net income change: £{overall['total_change']:.2f}bn (negative = loss)" + ) + print(f" Tax revenue change: £{overall['tax_revenue_change']:.2f}bn") + print("\nImpact by income 
decile:") + for i, label in enumerate(decile["labels"]): + print( + f" {label}: {decile['losers'][i]:.2f}m losers, avg change £{decile['avg_loss'][i]:.0f}" + ) + print("=" * 60) + + +def main(): + """Main execution function.""" + year = 2026 + + print("Loading representative household data...") + dataset = load_representative_data(year=year) + + print("Creating policy reform (zero personal allowance)...") + reform = create_personal_allowance_reform(year=year) + + print("Running baseline simulation...") + baseline_sim = run_baseline_simulation(dataset) + + print("Running reform simulation...") + reform_sim = run_reform_simulation(dataset, reform) + + print("Analysing overall impact...") + overall_impact = analyse_overall_impact(baseline_sim, reform_sim) + + print("Analysing impact by income decile...") + decile_impact = analyse_impact_by_income_decile(baseline_sim, reform_sim) + + print_summary(overall_impact, decile_impact, reform.name) + + print("\nGenerating visualisations...") + visualise_results(overall_impact, decile_impact, reform.name) + + +if __name__ == "__main__": + main() diff --git a/examples/speedtest_us_simulation.py b/examples/speedtest_us_simulation.py new file mode 100644 index 00000000..e0b18fb0 --- /dev/null +++ b/examples/speedtest_us_simulation.py @@ -0,0 +1,318 @@ +"""Speedtest: US simulation performance with different dataset sizes. + +This script tests how simulation.run() performance scales with dataset size +by running simulations on random subsets of households. 
+""" + +import time +from pathlib import Path + +import pandas as pd +from microdf import MicroDataFrame + +from policyengine.core import Simulation +from policyengine.tax_benefit_models.us import ( + PolicyEngineUSDataset, + USYearData, + us_latest, +) + + +def create_subset_dataset( + original_dataset: PolicyEngineUSDataset, n_households: int +) -> PolicyEngineUSDataset: + """Create a random subset of the dataset with n_households and reindexed entity IDs.""" + # Get original data + household_df = pd.DataFrame(original_dataset.data.household).copy() + person_df = pd.DataFrame(original_dataset.data.person).copy() + marital_unit_df = pd.DataFrame(original_dataset.data.marital_unit).copy() + family_df = pd.DataFrame(original_dataset.data.family).copy() + spm_unit_df = pd.DataFrame(original_dataset.data.spm_unit).copy() + tax_unit_df = pd.DataFrame(original_dataset.data.tax_unit).copy() + + # Sample random households (use n as seed to get different samples for different sizes) + sampled_households = household_df.sample( + n=n_households, random_state=n_households + ).copy() + sampled_household_ids = set(sampled_households["household_id"]) + + # Determine column naming convention + household_id_col = ( + "person_household_id" + if "person_household_id" in person_df.columns + else "household_id" + ) + marital_unit_id_col = ( + "person_marital_unit_id" + if "person_marital_unit_id" in person_df.columns + else "marital_unit_id" + ) + family_id_col = ( + "person_family_id" + if "person_family_id" in person_df.columns + else "family_id" + ) + spm_unit_id_col = ( + "person_spm_unit_id" + if "person_spm_unit_id" in person_df.columns + else "spm_unit_id" + ) + tax_unit_id_col = ( + "person_tax_unit_id" + if "person_tax_unit_id" in person_df.columns + else "tax_unit_id" + ) + + # Filter person table to only include people in sampled households + sampled_person = person_df[ + person_df[household_id_col].isin(sampled_household_ids) + ].copy() + + # Get IDs of group entities that 
have members in sampled households + sampled_marital_unit_ids = set( + sampled_person[marital_unit_id_col].unique() + ) + sampled_family_ids = set(sampled_person[family_id_col].unique()) + sampled_spm_unit_ids = set(sampled_person[spm_unit_id_col].unique()) + sampled_tax_unit_ids = set(sampled_person[tax_unit_id_col].unique()) + + # Filter group entity tables + sampled_marital_unit = marital_unit_df[ + marital_unit_df["marital_unit_id"].isin(sampled_marital_unit_ids) + ].copy() + sampled_family = family_df[ + family_df["family_id"].isin(sampled_family_ids) + ].copy() + sampled_spm_unit = spm_unit_df[ + spm_unit_df["spm_unit_id"].isin(sampled_spm_unit_ids) + ].copy() + sampled_tax_unit = tax_unit_df[ + tax_unit_df["tax_unit_id"].isin(sampled_tax_unit_ids) + ].copy() + + # Create ID mappings to reindex to contiguous integers starting from 0 + household_id_map = { + old_id: new_id + for new_id, old_id in enumerate(sorted(sampled_household_ids)) + } + marital_unit_id_map = { + old_id: new_id + for new_id, old_id in enumerate(sorted(sampled_marital_unit_ids)) + } + family_id_map = { + old_id: new_id + for new_id, old_id in enumerate(sorted(sampled_family_ids)) + } + spm_unit_id_map = { + old_id: new_id + for new_id, old_id in enumerate(sorted(sampled_spm_unit_ids)) + } + tax_unit_id_map = { + old_id: new_id + for new_id, old_id in enumerate(sorted(sampled_tax_unit_ids)) + } + person_id_map = { + old_id: new_id + for new_id, old_id in enumerate(sorted(sampled_person["person_id"])) + } + + # Reindex all entity IDs in household table + sampled_households["household_id"] = sampled_households[ + "household_id" + ].map(household_id_map) + + # Reindex all entity IDs in person table + sampled_person["person_id"] = sampled_person["person_id"].map( + person_id_map + ) + sampled_person[household_id_col] = sampled_person[household_id_col].map( + household_id_map + ) + sampled_person[marital_unit_id_col] = sampled_person[ + marital_unit_id_col + ].map(marital_unit_id_map) + 
sampled_person[family_id_col] = sampled_person[family_id_col].map( + family_id_map + ) + sampled_person[spm_unit_id_col] = sampled_person[spm_unit_id_col].map( + spm_unit_id_map + ) + sampled_person[tax_unit_id_col] = sampled_person[tax_unit_id_col].map( + tax_unit_id_map + ) + + # Reindex group entity tables + sampled_marital_unit["marital_unit_id"] = sampled_marital_unit[ + "marital_unit_id" + ].map(marital_unit_id_map) + sampled_family["family_id"] = sampled_family["family_id"].map( + family_id_map + ) + sampled_spm_unit["spm_unit_id"] = sampled_spm_unit["spm_unit_id"].map( + spm_unit_id_map + ) + sampled_tax_unit["tax_unit_id"] = sampled_tax_unit["tax_unit_id"].map( + tax_unit_id_map + ) + + # Sort by ID to ensure proper ordering + sampled_households = sampled_households.sort_values( + "household_id" + ).reset_index(drop=True) + sampled_person = sampled_person.sort_values("person_id").reset_index( + drop=True + ) + sampled_marital_unit = sampled_marital_unit.sort_values( + "marital_unit_id" + ).reset_index(drop=True) + sampled_family = sampled_family.sort_values("family_id").reset_index( + drop=True + ) + sampled_spm_unit = sampled_spm_unit.sort_values("spm_unit_id").reset_index( + drop=True + ) + sampled_tax_unit = sampled_tax_unit.sort_values("tax_unit_id").reset_index( + drop=True + ) + + # Create new dataset + subset_dataset = PolicyEngineUSDataset( + name=f"Subset {n_households} households", + description=f"Random subset of {n_households} households", + filepath=f"./data/subset_{n_households}_households.h5", + year=original_dataset.year, + data=USYearData( + person=MicroDataFrame(sampled_person, weights="person_weight"), + household=MicroDataFrame( + sampled_households, weights="household_weight" + ), + marital_unit=MicroDataFrame( + sampled_marital_unit, weights="marital_unit_weight" + ), + family=MicroDataFrame(sampled_family, weights="family_weight"), + spm_unit=MicroDataFrame( + sampled_spm_unit, weights="spm_unit_weight" + ), + 
tax_unit=MicroDataFrame( + sampled_tax_unit, weights="tax_unit_weight" + ), + ), + ) + + return subset_dataset + + +def speedtest_simulation(dataset: PolicyEngineUSDataset) -> float: + """Run simulation and return execution time in seconds.""" + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=us_latest, + ) + + start_time = time.time() + simulation.run() + end_time = time.time() + + return end_time - start_time + + +def main(): + print("Loading full enhanced CPS dataset...") + dataset_path = ( + Path(__file__).parent / "data" / "enhanced_cps_2024_year_2024.h5" + ) + + if not dataset_path.exists(): + raise FileNotFoundError( + f"Dataset not found at {dataset_path}. " + "Run create_datasets() from policyengine.tax_benefit_models.us first." + ) + + full_dataset = PolicyEngineUSDataset( + name="Enhanced CPS 2024", + description="Full enhanced CPS dataset", + filepath=str(dataset_path), + year=2024, + ) + full_dataset.load() + + total_households = len(full_dataset.data.household) + print(f"Full dataset: {total_households:,} households") + + # Test different subset sizes + test_sizes = [ + 100, + 500, + 1000, + 2500, + 5000, + 10000, + 21532, + ] # Last is full size + + results = [] + + for n_households in test_sizes: + if n_households > total_households: + continue + + print(f"\nTesting {n_households:,} households...") + + if n_households == total_households: + subset = full_dataset + else: + subset = create_subset_dataset(full_dataset, n_households) + + n_people = len(subset.data.person) + print(f" {n_people:,} people in subset") + + duration = speedtest_simulation(subset) + print(f" Simulation completed in {duration:.2f}s") + + results.append( + { + "households": n_households, + "people": n_people, + "duration_seconds": duration, + "households_per_second": n_households / duration, + } + ) + + print("\n" + "=" * 60) + print("SPEEDTEST RESULTS") + print("=" * 60) + print(f"{'Households':<12} {'People':<10} {'Duration':<12} {'HH/sec':<10}") + 
print("-" * 60) + + for result in results: + print( + f"{result['households']:<12,} {result['people']:<10,} " + f"{result['duration_seconds']:<12.2f} {result['households_per_second']:<10.1f}" + ) + + # Calculate scaling characteristics + print("\n" + "=" * 60) + print("SCALING ANALYSIS") + print("=" * 60) + + if len(results) >= 2: + # Compare first and last results + first = results[0] + last = results[-1] + + size_ratio = last["households"] / first["households"] + time_ratio = last["duration_seconds"] / first["duration_seconds"] + + print(f"Dataset size increased {size_ratio:.1f}x") + print(f"Simulation time increased {time_ratio:.1f}x") + + if time_ratio < size_ratio * 1.2: + print("Scaling: approximately linear or better") + elif time_ratio < size_ratio * 2: + print("Scaling: slightly worse than linear") + else: + print("Scaling: significantly worse than linear") + + +if __name__ == "__main__": + main() diff --git a/pyproject.toml b/pyproject.toml index 251a2d68..4df10f82 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,20 +13,11 @@ authors = [ license = {file = "LICENSE"} requires-python = ">=3.13" dependencies = [ - "sqlalchemy>=2.0.0", - "sqlmodel>=0.0.21", - "alembic>=1.13.0", - "psycopg2-binary>=2.9.0", - "pymysql>=1.1.0", - "google-cloud-storage>=2.10.0", - "getpass4", "pydantic>=2.0.0", "pandas>=2.0.0", - "rich>=13.0.0", - "ipywidgets>=8.0.0", "microdf_python", - "tqdm>=4.67.1", - "blosc>=1.11.3", + "plotly>=5.0.0", + "requests>=2.31.0", ] [project.optional-dependencies] @@ -64,14 +55,14 @@ where = ["src"] [tool.setuptools.package-data] "policyengine" = ["**/*"] -[project.scripts] -pe-migrate = "policyengine.migrations.runner:main" - [tool.pytest.ini_options] addopts = "-v" testpaths = [ "tests", ] +filterwarnings = [ + "ignore::pydantic.warnings.PydanticDeprecatedSince20", +] [tool.black] line-length = 79 diff --git a/src/policyengine/core/__init__.py b/src/policyengine/core/__init__.py new file mode 100644 index 00000000..b96e8edd --- /dev/null 
+++ b/src/policyengine/core/__init__.py @@ -0,0 +1,22 @@ +from .dataset import Dataset +from .dataset import map_to_entity as map_to_entity +from .dataset_version import DatasetVersion as DatasetVersion +from .dynamic import Dynamic as Dynamic +from .output import Output as Output +from .output import OutputCollection as OutputCollection +from .parameter import Parameter as Parameter +from .parameter_value import ParameterValue as ParameterValue +from .policy import Policy as Policy +from .simulation import Simulation as Simulation +from .tax_benefit_model import TaxBenefitModel as TaxBenefitModel +from .tax_benefit_model_version import ( + TaxBenefitModelVersion as TaxBenefitModelVersion, +) +from .variable import Variable as Variable + +# Rebuild models to resolve forward references +Dataset.model_rebuild() +TaxBenefitModelVersion.model_rebuild() +Variable.model_rebuild() +Parameter.model_rebuild() +ParameterValue.model_rebuild() diff --git a/src/policyengine/core/dataset.py b/src/policyengine/core/dataset.py new file mode 100644 index 00000000..a79c0b6d --- /dev/null +++ b/src/policyengine/core/dataset.py @@ -0,0 +1,260 @@ +from uuid import uuid4 + +import pandas as pd +from microdf import MicroDataFrame +from pydantic import BaseModel, ConfigDict, Field + +from .dataset_version import DatasetVersion +from .tax_benefit_model import TaxBenefitModel + + +class Dataset(BaseModel): + """Base class for datasets. + + The data field contains entity-level data as a BaseModel with DataFrame fields. 
+ + Example: + class YearData(BaseModel): + model_config = ConfigDict(arbitrary_types_allowed=True) + person: pd.DataFrame + household: pd.DataFrame + + class MyDataset(Dataset): + data: YearData | None = None + """ + + model_config = ConfigDict(arbitrary_types_allowed=True) + + id: str = Field(default_factory=lambda: str(uuid4())) + name: str + description: str + dataset_version: DatasetVersion | None = None + filepath: str + is_output_dataset: bool = False + tax_benefit_model: TaxBenefitModel | None = None + year: int + + data: BaseModel | None = None + + +def map_to_entity( + entity_data: dict[str, MicroDataFrame], + source_entity: str, + target_entity: str, + person_entity: str = "person", + columns: list[str] | None = None, +) -> MicroDataFrame: + """Map data from source entity to target entity using join keys. + + This is a generic entity mapping utility that handles: + - Same entity mapping (returns as is) + - Person to group entity mapping (aggregates values) + - Group to person entity mapping (expands values) + - Group to group entity mapping (aggregates through person entity) + + Args: + entity_data: Dictionary mapping entity names to their MicroDataFrame data + source_entity: The source entity name + target_entity: The target entity name + person_entity: The name of the person entity (default "person") + columns: List of column names to map. If None, maps all columns + + Returns: + MicroDataFrame: The mapped data at the target entity level + + Raises: + ValueError: If source or target entity is invalid + """ + valid_entities = set(entity_data.keys()) + + if source_entity not in valid_entities: + raise ValueError( + f"Invalid source entity '{source_entity}'. Must be one of {valid_entities}" + ) + if target_entity not in valid_entities: + raise ValueError( + f"Invalid target entity '{target_entity}'. 
Must be one of {valid_entities}" + ) + + # Get source data (convert to plain DataFrame to avoid weighted operations during mapping) + source_df = pd.DataFrame(entity_data[source_entity]) + + if columns: + # Select only requested columns (keep all ID columns for joins) + id_cols = {col for col in source_df.columns if col.endswith("_id")} + cols_to_keep = list(set(columns) | id_cols) + source_df = source_df[cols_to_keep] + + # Determine weight column for target entity + target_weight = f"{target_entity}_weight" + + # Same entity - return as is + if source_entity == target_entity: + return MicroDataFrame(source_df, weights=target_weight) + + # Get target data and key + target_df = entity_data[target_entity] + target_key = f"{target_entity}_id" + + # Person to group entity: aggregate person-level data to group level + if source_entity == person_entity and target_entity != person_entity: + # Check for both naming patterns: "entity_id" and "person_entity_id" + person_target_key = f"{person_entity}_{target_entity}_id" + join_key = ( + person_target_key + if person_target_key in source_df.columns + else target_key + ) + + if join_key in source_df.columns: + # Get columns to aggregate (exclude ID and weight columns) + id_cols = {col for col in source_df.columns if col.endswith("_id")} + weight_cols = { + col for col in source_df.columns if col.endswith("_weight") + } + agg_cols = [ + c + for c in source_df.columns + if c not in id_cols and c not in weight_cols + ] + + # Group by join key and sum + aggregated = source_df.groupby(join_key, as_index=False)[ + agg_cols + ].sum() + + # Rename join key to target key if needed + if join_key != target_key: + aggregated = aggregated.rename(columns={join_key: target_key}) + + # Merge with target, preserving original order + target_pd = pd.DataFrame(target_df)[[target_key, target_weight]] + target_pd = target_pd.reset_index(drop=False) + result = target_pd.merge(aggregated, on=target_key, how="left") + + # Sort back to original order 
+ result = ( + result.sort_values("index") + .drop("index", axis=1) + .reset_index(drop=True) + ) + + # Fill NaN with 0 for groups with no members in source entity + result[agg_cols] = result[agg_cols].fillna(0) + + return MicroDataFrame(result, weights=target_weight) + + # Group entity to person: expand group-level data to person level + if source_entity != person_entity and target_entity == person_entity: + source_key = f"{source_entity}_id" + # Check for both naming patterns + person_source_key = f"{person_entity}_{source_entity}_id" + + target_pd = pd.DataFrame(target_df) + join_key = ( + person_source_key + if person_source_key in target_pd.columns + else source_key + ) + + if join_key in target_pd.columns: + # Rename source key to match join key if needed + if join_key != source_key and source_key in source_df.columns: + source_df = source_df.rename(columns={source_key: join_key}) + + result = target_pd.merge(source_df, on=join_key, how="left") + return MicroDataFrame(result, weights=target_weight) + + # Group to group: go through person table + if source_entity != person_entity and target_entity != person_entity: + # Get person link table with both entity IDs + person_df = pd.DataFrame(entity_data[person_entity]) + source_key = f"{source_entity}_id" + + # Check for both naming patterns for person-level links + person_source_key = f"{person_entity}_{source_entity}_id" + person_target_key = f"{person_entity}_{target_entity}_id" + + # Determine which keys exist in person table + source_link_key = ( + person_source_key + if person_source_key in person_df.columns + else source_key + ) + target_link_key = ( + person_target_key + if person_target_key in person_df.columns + else target_key + ) + + # Link source -> person -> target + if ( + source_link_key in person_df.columns + and target_link_key in person_df.columns + ): + person_link = person_df[ + [source_link_key, target_link_key] + ].drop_duplicates() + + # Rename source key to match link key if needed + 
source_df_copy = source_df.copy() + if ( + source_link_key != source_key + and source_key in source_df_copy.columns + ): + source_df_copy = source_df_copy.rename( + columns={source_key: source_link_key} + ) + + # Join source data with target key + source_with_target = source_df_copy.merge( + person_link, on=source_link_key, how="left" + ) + + # Aggregate to target level + id_cols = { + col + for col in source_with_target.columns + if col.endswith("_id") + } + weight_cols = { + col + for col in source_with_target.columns + if col.endswith("_weight") + } + agg_cols = [ + c + for c in source_with_target.columns + if c not in id_cols and c not in weight_cols + ] + + aggregated = source_with_target.groupby( + target_link_key, as_index=False + )[agg_cols].sum() + + # Rename target link key to target key if needed + if target_link_key != target_key: + aggregated = aggregated.rename( + columns={target_link_key: target_key} + ) + + # Merge with target, preserving original order + target_pd = pd.DataFrame(target_df)[[target_key, target_weight]] + target_pd = target_pd.reset_index(drop=False) + result = target_pd.merge(aggregated, on=target_key, how="left") + + # Sort back to original order + result = ( + result.sort_values("index") + .drop("index", axis=1) + .reset_index(drop=True) + ) + + # Fill NaN with 0 + result[agg_cols] = result[agg_cols].fillna(0) + + return MicroDataFrame(result, weights=target_weight) + + raise ValueError( + f"Unsupported mapping from {source_entity} to {target_entity}" + ) diff --git a/src/policyengine/core/dataset_version.py b/src/policyengine/core/dataset_version.py new file mode 100644 index 00000000..711cd7d7 --- /dev/null +++ b/src/policyengine/core/dataset_version.py @@ -0,0 +1,16 @@ +from typing import TYPE_CHECKING +from uuid import uuid4 + +from pydantic import BaseModel, Field + +from .tax_benefit_model import TaxBenefitModel + +if TYPE_CHECKING: + from .dataset import Dataset + + +class DatasetVersion(BaseModel): + id: str = 
Field(default_factory=lambda: str(uuid4())) + dataset: "Dataset" + description: str + tax_benefit_model: TaxBenefitModel = None diff --git a/src/policyengine/models/dynamic.py b/src/policyengine/core/dynamic.py similarity index 82% rename from src/policyengine/models/dynamic.py rename to src/policyengine/core/dynamic.py index 40cf364f..9b312952 100644 --- a/src/policyengine/models/dynamic.py +++ b/src/policyengine/core/dynamic.py @@ -4,12 +4,14 @@ from pydantic import BaseModel, Field +from .parameter_value import ParameterValue + class Dynamic(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) name: str description: str | None = None - parameter_values: list[str] = [] + parameter_values: list[ParameterValue] = [] simulation_modifier: Callable | None = None created_at: datetime = Field(default_factory=datetime.now) updated_at: datetime = Field(default_factory=datetime.now) diff --git a/src/policyengine/core/output.py b/src/policyengine/core/output.py new file mode 100644 index 00000000..a4bf969a --- /dev/null +++ b/src/policyengine/core/output.py @@ -0,0 +1,26 @@ +from typing import TypeVar + +import pandas as pd +from pydantic import BaseModel, ConfigDict + +T = TypeVar("T", bound="Output") + + +class Output(BaseModel): + """Base class for all output templates.""" + + def run(self): + """Calculate and populate the output fields. + + Must be implemented by subclasses. 
+ """ + raise NotImplementedError("Subclasses must implement run()") + + +class OutputCollection[T: "Output"](BaseModel): + """Container for a collection of outputs with their DataFrame representation.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + outputs: list[T] + dataframe: pd.DataFrame diff --git a/src/policyengine/models/parameter.py b/src/policyengine/core/parameter.py similarity index 58% rename from src/policyengine/models/parameter.py rename to src/policyengine/core/parameter.py index c438f4f6..54e3e116 100644 --- a/src/policyengine/models/parameter.py +++ b/src/policyengine/core/parameter.py @@ -2,11 +2,13 @@ from pydantic import BaseModel, Field -from .model import Model +from .tax_benefit_model_version import TaxBenefitModelVersion class Parameter(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) + name: str description: str | None = None data_type: type | None = None - model: Model | None = None + tax_benefit_model_version: TaxBenefitModelVersion + unit: str | None = None diff --git a/src/policyengine/models/parameter_value.py b/src/policyengine/core/parameter_value.py similarity index 89% rename from src/policyengine/models/parameter_value.py rename to src/policyengine/core/parameter_value.py index a7867557..c997d794 100644 --- a/src/policyengine/models/parameter_value.py +++ b/src/policyengine/core/parameter_value.py @@ -8,7 +8,7 @@ class ParameterValue(BaseModel): id: str = Field(default_factory=lambda: str(uuid4())) - parameter: Parameter + parameter: Parameter | None = None value: float | int | str | bool | list | None = None start_date: datetime end_date: datetime | None = None diff --git a/src/policyengine/models/policy.py b/src/policyengine/core/policy.py similarity index 100% rename from src/policyengine/models/policy.py rename to src/policyengine/core/policy.py diff --git a/src/policyengine/models/simulation.py b/src/policyengine/core/simulation.py similarity index 51% rename from 
src/policyengine/models/simulation.py rename to src/policyengine/core/simulation.py index 8993ebe6..1e493b9a 100644 --- a/src/policyengine/models/simulation.py +++ b/src/policyengine/core/simulation.py @@ -1,14 +1,12 @@ from datetime import datetime -from typing import Any from uuid import uuid4 from pydantic import BaseModel, Field from .dataset import Dataset from .dynamic import Dynamic -from .model import Model -from .model_version import ModelVersion from .policy import Policy +from .tax_benefit_model_version import TaxBenefitModelVersion class Simulation(BaseModel): @@ -18,17 +16,15 @@ class Simulation(BaseModel): policy: Policy | None = None dynamic: Dynamic | None = None - dataset: Dataset + dataset: Dataset = None - model: Model - model_version: ModelVersion - result: Any | None = None + tax_benefit_model_version: TaxBenefitModelVersion = None + output_dataset: Dataset | None = None + + variables: dict[str, list[str]] | None = Field( + default=None, + description="Optional dictionary mapping entity names to lists of variable names to calculate. 
If None, uses model defaults.", + ) def run(self): - self.result = self.model.simulation_function( - dataset=self.dataset, - policy=self.policy, - dynamic=self.dynamic, - ) - self.updated_at = datetime.now() - return self.result + self.tax_benefit_model_version.run(self) diff --git a/src/policyengine/core/tax_benefit_model.py b/src/policyengine/core/tax_benefit_model.py new file mode 100644 index 00000000..02cb94ef --- /dev/null +++ b/src/policyengine/core/tax_benefit_model.py @@ -0,0 +1,11 @@ +from typing import TYPE_CHECKING + +from pydantic import BaseModel + +if TYPE_CHECKING: + pass + + +class TaxBenefitModel(BaseModel): + id: str + description: str | None = None diff --git a/src/policyengine/core/tax_benefit_model_version.py b/src/policyengine/core/tax_benefit_model_version.py new file mode 100644 index 00000000..8555c6f6 --- /dev/null +++ b/src/policyengine/core/tax_benefit_model_version.py @@ -0,0 +1,34 @@ +from datetime import datetime +from typing import TYPE_CHECKING +from uuid import uuid4 + +from pydantic import BaseModel, Field + +from .tax_benefit_model import TaxBenefitModel + +if TYPE_CHECKING: + from .parameter import Parameter + from .parameter_value import ParameterValue + from .simulation import Simulation + from .variable import Variable + + +class TaxBenefitModelVersion(BaseModel): + id: str = Field(default_factory=lambda: str(uuid4())) + model: TaxBenefitModel + version: str + description: str | None = None + created_at: datetime | None = Field(default_factory=datetime.utcnow) + + variables: list["Variable"] = Field(default_factory=list) + parameters: list["Parameter"] = Field(default_factory=list) + parameter_values: list["ParameterValue"] = Field(default_factory=list) + + def run(self, simulation: "Simulation") -> "Simulation": + raise NotImplementedError( + "The TaxBenefitModel class must define a method to execute simulations." 
+ ) + + def __repr__(self) -> str: + # Give the id and version, and the number of variables, parameters, parameter values + return f"" diff --git a/src/policyengine/core/variable.py b/src/policyengine/core/variable.py new file mode 100644 index 00000000..24375120 --- /dev/null +++ b/src/policyengine/core/variable.py @@ -0,0 +1,15 @@ +from typing import Any + +from pydantic import BaseModel + +from .tax_benefit_model_version import TaxBenefitModelVersion + + +class Variable(BaseModel): + id: str + name: str + tax_benefit_model_version: TaxBenefitModelVersion + entity: str + description: str | None = None + data_type: type = None + possible_values: list[Any] | None = None diff --git a/src/policyengine/database/__init__.py b/src/policyengine/database/__init__.py deleted file mode 100644 index 2490d15c..00000000 --- a/src/policyengine/database/__init__.py +++ /dev/null @@ -1,56 +0,0 @@ -from .baseline_parameter_value_table import ( - BaselineParameterValueTable, - baseline_parameter_value_table_link, -) -from .baseline_variable_table import ( - BaselineVariableTable, - baseline_variable_table_link, -) -from .database import Database -from .dataset_table import DatasetTable, dataset_table_link -from .dynamic_table import DynamicTable, dynamic_table_link -from .link import TableLink - -# Import all table classes and links -from .model_table import ModelTable, model_table_link -from .model_version_table import ModelVersionTable, model_version_table_link -from .parameter_table import ParameterTable, parameter_table_link -from .parameter_value_table import ( - ParameterValueTable, - parameter_value_table_link, -) -from .policy_table import PolicyTable, policy_table_link -from .simulation_table import SimulationTable, simulation_table_link -from .versioned_dataset_table import ( - VersionedDatasetTable, - versioned_dataset_table_link, -) - -__all__ = [ - "Database", - "TableLink", - # Tables - "ModelTable", - "ModelVersionTable", - "DatasetTable", - "VersionedDatasetTable", 
- "PolicyTable", - "DynamicTable", - "ParameterTable", - "ParameterValueTable", - "BaselineParameterValueTable", - "BaselineVariableTable", - "SimulationTable", - # Links - "model_table_link", - "model_version_table_link", - "dataset_table_link", - "versioned_dataset_table_link", - "policy_table_link", - "dynamic_table_link", - "parameter_table_link", - "parameter_value_table_link", - "baseline_parameter_value_table_link", - "baseline_variable_table_link", - "simulation_table_link", -] diff --git a/src/policyengine/database/aggregate.py b/src/policyengine/database/aggregate.py deleted file mode 100644 index b945ea6c..00000000 --- a/src/policyengine/database/aggregate.py +++ /dev/null @@ -1,33 +0,0 @@ -from uuid import uuid4 - -from sqlmodel import Field, SQLModel - -from policyengine.database.link import TableLink -from policyengine.models.aggregate import Aggregate - - -class AggregateTable(SQLModel, table=True): - __tablename__ = "aggregates" - - id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) - simulation_id: str = Field( - foreign_key="simulations.id", ondelete="CASCADE" - ) - entity: str - variable_name: str - year: int | None = None - filter_variable_name: str | None = None - filter_variable_value: str | None = None - filter_variable_leq: float | None = None - filter_variable_geq: float | None = None - aggregate_function: str - value: float | None = None - - -aggregate_table_link = TableLink( - model_cls=Aggregate, - table_cls=AggregateTable, - model_to_table_custom_transforms=dict( - simulation_id=lambda a: a.simulation.id, - ), -) diff --git a/src/policyengine/database/baseline_parameter_value_table.py b/src/policyengine/database/baseline_parameter_value_table.py deleted file mode 100644 index 49282996..00000000 --- a/src/policyengine/database/baseline_parameter_value_table.py +++ /dev/null @@ -1,66 +0,0 @@ -from datetime import datetime -from typing import Any -from uuid import uuid4 - -from sqlmodel import JSON, Column, Field, 
SQLModel - -from policyengine.models import BaselineParameterValue - -from .link import TableLink - - -class BaselineParameterValueTable(SQLModel, table=True): - __tablename__ = "baseline_parameter_values" - __table_args__ = ({"extend_existing": True},) - - id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) - parameter_id: str = Field(nullable=False) # Part of composite foreign key - model_id: str = Field(nullable=False) # Part of composite foreign key - model_version_id: str = Field( - foreign_key="model_versions.id", ondelete="CASCADE" - ) - value: Any | None = Field( - default=None, sa_column=Column(JSON) - ) # JSON field for any type - start_date: datetime = Field(nullable=False) - end_date: datetime | None = Field(default=None) - - -def transform_value_to_table(bpv): - """Transform value for storage, handling special float values.""" - import math - - value = bpv.value - if isinstance(value, float): - if math.isinf(value): - return "Infinity" if value > 0 else "-Infinity" - elif math.isnan(value): - return "NaN" - return value - - -def transform_value_from_table(table_row): - """Transform value from storage, converting special strings back to floats.""" - value = table_row.value - if value == "Infinity": - return float("inf") - elif value == "-Infinity": - return float("-inf") - elif value == "NaN": - return float("nan") - return value - - -baseline_parameter_value_table_link = TableLink( - model_cls=BaselineParameterValue, - table_cls=BaselineParameterValueTable, - model_to_table_custom_transforms=dict( - parameter_id=lambda bpv: bpv.parameter.id, - model_id=lambda bpv: bpv.parameter.model.id, # Add model_id from parameter - model_version_id=lambda bpv: bpv.model_version.id, - value=transform_value_to_table, - ), - table_to_model_custom_transforms=dict( - value=transform_value_from_table, - ), -) diff --git a/src/policyengine/database/baseline_variable_table.py b/src/policyengine/database/baseline_variable_table.py deleted file mode 
100644 index 6f7e61e5..00000000 --- a/src/policyengine/database/baseline_variable_table.py +++ /dev/null @@ -1,40 +0,0 @@ -from sqlmodel import Field, SQLModel - -from policyengine.models import BaselineVariable -from policyengine.utils.compress import compress_data, decompress_data - -from .link import TableLink - - -class BaselineVariableTable(SQLModel, table=True): - __tablename__ = "baseline_variables" - __table_args__ = ({"extend_existing": True},) - - id: str = Field(primary_key=True) # Variable name - model_id: str = Field( - primary_key=True, foreign_key="models.id" - ) # Part of composite key - model_version_id: str = Field( - foreign_key="model_versions.id", ondelete="CASCADE" - ) - entity: str = Field(nullable=False) - label: str | None = Field(default=None) - description: str | None = Field(default=None) - data_type: bytes | None = Field(default=None) # Pickled type - - -baseline_variable_table_link = TableLink( - model_cls=BaselineVariable, - table_cls=BaselineVariableTable, - primary_key=("id", "model_id"), # Composite primary key - model_to_table_custom_transforms=dict( - model_id=lambda bv: bv.model_version.model.id, # Add model_id from model_version - model_version_id=lambda bv: bv.model_version.id, - data_type=lambda bv: compress_data(bv.data_type) - if bv.data_type - else None, - ), - table_to_model_custom_transforms=dict( - data_type=lambda dt: decompress_data(dt) if dt else None, - ), -) diff --git a/src/policyengine/database/database.py b/src/policyengine/database/database.py deleted file mode 100644 index 61b03b83..00000000 --- a/src/policyengine/database/database.py +++ /dev/null @@ -1,251 +0,0 @@ -from typing import Any - -from sqlmodel import Session, SQLModel - -from .aggregate import aggregate_table_link -from .baseline_parameter_value_table import baseline_parameter_value_table_link -from .baseline_variable_table import baseline_variable_table_link -from .dataset_table import dataset_table_link -from .dynamic_table import 
dynamic_table_link -from .link import TableLink - -# Import all table links -from .model_table import model_table_link -from .model_version_table import model_version_table_link -from .parameter_table import parameter_table_link -from .parameter_value_table import parameter_value_table_link -from .policy_table import policy_table_link -from .report_element_table import report_element_table_link -from .report_table import report_table_link -from .simulation_table import simulation_table_link -from .user_table import user_table_link -from .versioned_dataset_table import versioned_dataset_table_link - - -class Database: - url: str - - _model_table_links: list[TableLink] = [] - - def __init__(self, url: str): - self.url = url - self.engine = self._create_engine() - self.session = Session(self.engine) - - for link in [ - model_table_link, - model_version_table_link, - dataset_table_link, - versioned_dataset_table_link, - policy_table_link, - dynamic_table_link, - parameter_table_link, - parameter_value_table_link, - baseline_parameter_value_table_link, - baseline_variable_table_link, - simulation_table_link, - aggregate_table_link, - user_table_link, - report_table_link, - report_element_table_link, - ]: - self.register_table(link) - - def _create_engine(self): - from sqlmodel import create_engine - - return create_engine(self.url, echo=False) - - def create_tables(self): - """Create all database tables.""" - SQLModel.metadata.create_all(self.engine) - - def drop_tables(self): - """Drop all database tables.""" - SQLModel.metadata.drop_all(self.engine) - - def reset(self): - """Drop and recreate all tables.""" - self.drop_tables() - self.create_tables() - - def __enter__(self): - """Context manager entry - creates a session.""" - self.session = Session(self.engine) - return self.session - - def __exit__(self, exc_type, exc_val, exc_tb): - """Context manager exit - closes the session.""" - if exc_type: - self.session.rollback() - else: - self.session.commit() - 
self.session.close() - - def register_table(self, link: TableLink): - self._model_table_links.append(link) - # Create the table if not exists - link.table_cls.metadata.create_all(self.engine) - - def get(self, model_cls: type, **kwargs): - table_link = next( - ( - link - for link in self._model_table_links - if link.model_cls == model_cls - ), - None, - ) - if table_link is not None: - return table_link.get(self, **kwargs) - - def set(self, object: Any, commit: bool = True): - table_link = next( - ( - link - for link in self._model_table_links - if link.model_cls is type(object) - ), - None, - ) - if table_link is not None: - table_link.set(self, object, commit=commit) - - def register_model_version(self, model_version): - """Register a model version with its model and seed objects. - This replaces all existing parameters, baseline parameter values, - and baseline variables for this model version.""" - # Add or update the model directly to avoid conflicts - from policyengine.utils.compress import compress_data - - from .baseline_parameter_value_table import BaselineParameterValueTable - from .baseline_variable_table import BaselineVariableTable - from .model_table import ModelTable - from .model_version_table import ModelVersionTable - from .parameter_table import ParameterTable - - existing_model = ( - self.session.query(ModelTable) - .filter(ModelTable.id == model_version.model.id) - .first() - ) - if not existing_model: - model_table = ModelTable( - id=model_version.model.id, - name=model_version.model.name, - description=model_version.model.description, - simulation_function=( - lambda m: compress_data(m.simulation_function) - )(model_version.model), - ) - self.session.add(model_table) - self.session.flush() - - # Add or update the model version - existing_version = ( - self.session.query(ModelVersionTable) - .filter(ModelVersionTable.id == model_version.id) - .first() - ) - if not existing_version: - version_table = ModelVersionTable( - id=model_version.id, - 
model_id=model_version.model.id, - version=model_version.version, - description=model_version.description, - created_at=model_version.created_at, - ) - self.session.add(version_table) - self.session.flush() - - # Get seed objects from the model - seed_objects = model_version.model.create_seed_objects(model_version) - - # Delete ALL existing seed data for this model (not just this version) - # This ensures we start fresh with the new version's data - # Order matters due to foreign key constraints - - # First delete baseline parameter values (they reference parameters) - self.session.query(BaselineParameterValueTable).filter( - BaselineParameterValueTable.model_id == model_version.model.id - ).delete() - - # Then delete baseline variables for this model - self.session.query(BaselineVariableTable).filter( - BaselineVariableTable.model_id == model_version.model.id - ).delete() - - # Finally delete all parameters for this model - self.session.query(ParameterTable).filter( - ParameterTable.model_id == model_version.model.id - ).delete() - - self.session.commit() - - # Add all parameters first - for parameter in seed_objects.parameters: - # We need to add directly to session to avoid the autoflush issue - from .parameter_table import ParameterTable - - param_table = ParameterTable( - id=parameter.id, - model_id=parameter.model.id, # Now required as part of composite key - description=parameter.description, - data_type=parameter.data_type.__name__ - if parameter.data_type - else None, - ) - self.session.add(param_table) - - # Flush parameters to database so they exist for foreign key constraints - self.session.flush() - - # Add all baseline parameter values - for baseline_param_value in seed_objects.baseline_parameter_values: - import math - from uuid import uuid4 - - from .baseline_parameter_value_table import ( - BaselineParameterValueTable, - ) - - # Handle special float values that JSON doesn't support - value = baseline_param_value.value - if isinstance(value, float): 
- if math.isinf(value): - value = "Infinity" if value > 0 else "-Infinity" - elif math.isnan(value): - value = "NaN" - - bpv_table = BaselineParameterValueTable( - id=str(uuid4()), - parameter_id=baseline_param_value.parameter.id, - model_id=baseline_param_value.parameter.model.id, # Add model_id - model_version_id=baseline_param_value.model_version.id, - value=value, - start_date=baseline_param_value.start_date, - end_date=baseline_param_value.end_date, - ) - self.session.add(bpv_table) - - # Add all baseline variables - for baseline_variable in seed_objects.baseline_variables: - from .baseline_variable_table import BaselineVariableTable - - bv_table = BaselineVariableTable( - id=baseline_variable.id, - model_id=baseline_variable.model_version.model.id, # Add model_id - model_version_id=baseline_variable.model_version.id, - entity=baseline_variable.entity, - label=baseline_variable.label, - description=baseline_variable.description, - data_type=(lambda bv: compress_data(bv.data_type))( - baseline_variable - ) - if baseline_variable.data_type - else None, - ) - self.session.add(bv_table) - - # Commit everything at once - self.session.commit() diff --git a/src/policyengine/database/dataset_table.py b/src/policyengine/database/dataset_table.py deleted file mode 100644 index 4eb4156c..00000000 --- a/src/policyengine/database/dataset_table.py +++ /dev/null @@ -1,41 +0,0 @@ -from uuid import uuid4 - -from sqlmodel import Field, SQLModel - -from policyengine.models import Dataset -from policyengine.utils.compress import compress_data, decompress_data - -from .link import TableLink - - -class DatasetTable(SQLModel, table=True): - __tablename__ = "datasets" - - id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) - name: str = Field(nullable=False) - description: str | None = Field(default=None) - version: str | None = Field(default=None) - versioned_dataset_id: str | None = Field( - default=None, foreign_key="versioned_datasets.id", ondelete="SET NULL" 
- ) - year: int | None = Field(default=None) - data: bytes | None = Field(default=None) - model_id: str | None = Field( - default=None, foreign_key="models.id", ondelete="SET NULL" - ) - - -dataset_table_link = TableLink( - model_cls=Dataset, - table_cls=DatasetTable, - model_to_table_custom_transforms=dict( - versioned_dataset_id=lambda d: d.versioned_dataset.id - if d.versioned_dataset - else None, - model_id=lambda d: d.model.id if d.model else None, - data=lambda d: compress_data(d.data) if d.data else None, - ), - table_to_model_custom_transforms=dict( - data=lambda b: decompress_data(b) if b else None, - ), -) diff --git a/src/policyengine/database/dynamic_table.py b/src/policyengine/database/dynamic_table.py deleted file mode 100644 index 6d510afe..00000000 --- a/src/policyengine/database/dynamic_table.py +++ /dev/null @@ -1,34 +0,0 @@ -from datetime import datetime -from uuid import uuid4 - -from sqlmodel import Field, SQLModel - -from policyengine.models import Dynamic -from policyengine.utils.compress import compress_data, decompress_data - -from .link import TableLink - - -class DynamicTable(SQLModel, table=True): - __tablename__ = "dynamics" - - id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) - name: str = Field(nullable=False) - description: str | None = Field(default=None) - simulation_modifier: bytes | None = Field(default=None) - created_at: datetime = Field(default_factory=datetime.now) - updated_at: datetime = Field(default_factory=datetime.now) - - -dynamic_table_link = TableLink( - model_cls=Dynamic, - table_cls=DynamicTable, - model_to_table_custom_transforms=dict( - simulation_modifier=lambda d: compress_data(d.simulation_modifier) - if d.simulation_modifier - else None, - ), - table_to_model_custom_transforms=dict( - simulation_modifier=lambda b: decompress_data(b) if b else None, - ), -) diff --git a/src/policyengine/database/link.py b/src/policyengine/database/link.py deleted file mode 100644 index f6f19da8..00000000 
--- a/src/policyengine/database/link.py +++ /dev/null @@ -1,82 +0,0 @@ -from collections.abc import Callable -from typing import TYPE_CHECKING - -from pydantic import BaseModel -from sqlmodel import SQLModel, select - -if TYPE_CHECKING: - from .database import Database - - -class TableLink(BaseModel): - model_cls: type[BaseModel] - table_cls: type[SQLModel] - model_to_table_custom_transforms: dict[str, Callable] | None = None - table_to_model_custom_transforms: dict[str, Callable] | None = None - primary_key: str | tuple[str, ...] = ( - "id" # Allow multiple strings in tuple - ) - - def get(self, database: "Database", **kwargs): - statement = select(self.table_cls).filter_by(**kwargs) - result = database.session.exec(statement).first() - if result is None: - return None - model_data = result.model_dump() - if self.table_to_model_custom_transforms: - for ( - field, - transform, - ) in self.table_to_model_custom_transforms.items(): - model_data[field] = transform(getattr(result, field)) - - # Only include fields that exist in the model class - valid_fields = { - field_name for field_name in self.model_cls.__annotations__.keys() - } - filtered_model_data = { - k: v for k, v in model_data.items() if k in valid_fields - } - return self.model_cls(**filtered_model_data) - - def set(self, database: "Database", obj: BaseModel, commit: bool = True): - model_data = obj.model_dump() - if self.model_to_table_custom_transforms: - for ( - field, - transform, - ) in self.model_to_table_custom_transforms.items(): - model_data[field] = transform(obj) - # Only include fields that exist in the table class - valid_fields = { - field_name for field_name in self.table_cls.__annotations__.keys() - } - filtered_model_data = { - k: v for k, v in model_data.items() if k in valid_fields - } - table_obj = self.table_cls(**filtered_model_data) - - # Check if already exists using primary key - query = select(self.table_cls) - if isinstance(self.primary_key, tuple): - for key in self.primary_key: 
- query = query.where( - getattr(self.table_cls, key) == getattr(table_obj, key) - ) - else: - query = query.where( - getattr(self.table_cls, self.primary_key) - == getattr(table_obj, self.primary_key) - ) - - existing = database.session.exec(query).first() - if existing: - # Update existing record - for key, value in filtered_model_data.items(): - setattr(existing, key, value) - database.session.add(existing) - else: - database.session.add(table_obj) - - if commit: - database.session.commit() diff --git a/src/policyengine/database/model_table.py b/src/policyengine/database/model_table.py deleted file mode 100644 index 40d4d2e8..00000000 --- a/src/policyengine/database/model_table.py +++ /dev/null @@ -1,27 +0,0 @@ -from sqlmodel import Field, SQLModel - -from policyengine.models import Model -from policyengine.utils.compress import compress_data, decompress_data - -from .link import TableLink - - -class ModelTable(SQLModel, table=True, extend_existing=True): - __tablename__ = "models" - - id: str = Field(primary_key=True) - name: str = Field(nullable=False) - description: str | None = Field(default=None) - simulation_function: bytes - - -model_table_link = TableLink( - model_cls=Model, - table_cls=ModelTable, - model_to_table_custom_transforms=dict( - simulation_function=lambda m: compress_data(m.simulation_function), - ), - table_to_model_custom_transforms=dict( - simulation_function=lambda b: decompress_data(b), - ), -) diff --git a/src/policyengine/database/model_version_table.py b/src/policyengine/database/model_version_table.py deleted file mode 100644 index fe590ec9..00000000 --- a/src/policyengine/database/model_version_table.py +++ /dev/null @@ -1,28 +0,0 @@ -from datetime import datetime -from uuid import uuid4 - -from sqlmodel import Field, SQLModel - -from policyengine.models import ModelVersion - -from .link import TableLink - - -class ModelVersionTable(SQLModel, table=True): - __tablename__ = "model_versions" - - id: str = Field(default_factory=lambda: 
str(uuid4()), primary_key=True) - model_id: str = Field(foreign_key="models.id", ondelete="CASCADE") - version: str = Field(nullable=False) - description: str | None = Field(default=None) - created_at: datetime = Field(default_factory=datetime.now) - - -model_version_table_link = TableLink( - model_cls=ModelVersion, - table_cls=ModelVersionTable, - model_to_table_custom_transforms=dict( - model_id=lambda model_version: model_version.model.id, - ), - table_to_model_custom_transforms={}, -) diff --git a/src/policyengine/database/parameter_table.py b/src/policyengine/database/parameter_table.py deleted file mode 100644 index 500484e1..00000000 --- a/src/policyengine/database/parameter_table.py +++ /dev/null @@ -1,31 +0,0 @@ -from sqlmodel import Field, SQLModel - -from policyengine.models import Parameter - -from .link import TableLink - - -class ParameterTable(SQLModel, table=True): - __tablename__ = "parameters" - __table_args__ = ({"extend_existing": True},) - - id: str = Field(primary_key=True) # Parameter name - model_id: str = Field( - primary_key=True, foreign_key="models.id" - ) # Part of composite key - description: str | None = Field(default=None) - data_type: str | None = Field(nullable=True) # Data type name - - -parameter_table_link = TableLink( - model_cls=Parameter, - table_cls=ParameterTable, - primary_key=("id", "model_id"), # Composite primary key - model_to_table_custom_transforms=dict( - data_type=lambda p: p.data_type.__name__ if p.data_type else None, - model_id=lambda p: p.model.id if p.model else None, - ), - table_to_model_custom_transforms=dict( - data_type=lambda t: eval(t.data_type) if t.data_type else None - ), -) diff --git a/src/policyengine/database/parameter_value_table.py b/src/policyengine/database/parameter_value_table.py deleted file mode 100644 index 1bdc19c2..00000000 --- a/src/policyengine/database/parameter_value_table.py +++ /dev/null @@ -1,62 +0,0 @@ -from datetime import datetime -from typing import Any -from uuid import 
uuid4 - -from sqlmodel import JSON, Column, Field, SQLModel - -from policyengine.models import ParameterValue - -from .link import TableLink - - -class ParameterValueTable(SQLModel, table=True): - __tablename__ = "parameter_values" - __table_args__ = ({"extend_existing": True},) - - id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) - parameter_id: str = Field(nullable=False) # Part of composite foreign key - model_id: str = Field(nullable=False) # Part of composite foreign key - value: Any | None = Field( - default=None, sa_column=Column(JSON) - ) # JSON field for any type - start_date: datetime = Field(nullable=False) - end_date: datetime | None = Field(default=None) - - -def transform_value_to_table(pv): - """Transform value for storage, handling special float values.""" - import math - - value = pv.value - if isinstance(value, float): - if math.isinf(value): - return "Infinity" if value > 0 else "-Infinity" - elif math.isnan(value): - return "NaN" - return value - - -def transform_value_from_table(table_row): - """Transform value from storage, converting special strings back to floats.""" - value = table_row.value - if value == "Infinity": - return float("inf") - elif value == "-Infinity": - return float("-inf") - elif value == "NaN": - return float("nan") - return value - - -parameter_value_table_link = TableLink( - model_cls=ParameterValue, - table_cls=ParameterValueTable, - model_to_table_custom_transforms=dict( - parameter_id=lambda pv: pv.parameter.id, - model_id=lambda pv: pv.parameter.model.id, # Add model_id from parameter - value=transform_value_to_table, - ), - table_to_model_custom_transforms=dict( - value=transform_value_from_table, - ), -) diff --git a/src/policyengine/database/policy_table.py b/src/policyengine/database/policy_table.py deleted file mode 100644 index b8ce5a88..00000000 --- a/src/policyengine/database/policy_table.py +++ /dev/null @@ -1,34 +0,0 @@ -from datetime import datetime -from uuid import uuid4 - -from 
sqlmodel import Field, SQLModel - -from policyengine.models import Policy -from policyengine.utils.compress import compress_data, decompress_data - -from .link import TableLink - - -class PolicyTable(SQLModel, table=True): - __tablename__ = "policies" - - id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) - name: str = Field(nullable=False) - description: str | None = Field(default=None) - simulation_modifier: bytes | None = Field(default=None) - created_at: datetime = Field(default_factory=datetime.now) - updated_at: datetime = Field(default_factory=datetime.now) - - -policy_table_link = TableLink( - model_cls=Policy, - table_cls=PolicyTable, - model_to_table_custom_transforms=dict( - simulation_modifier=lambda p: compress_data(p.simulation_modifier) - if p.simulation_modifier - else None, - ), - table_to_model_custom_transforms=dict( - simulation_modifier=lambda b: decompress_data(b) if b else None, - ), -) diff --git a/src/policyengine/database/report_element_table.py b/src/policyengine/database/report_element_table.py deleted file mode 100644 index 477bfcc3..00000000 --- a/src/policyengine/database/report_element_table.py +++ /dev/null @@ -1,48 +0,0 @@ -import uuid -from datetime import datetime - -from sqlmodel import Field, SQLModel - -from policyengine.models.report_element import ReportElement - -from .link import TableLink - - -class ReportElementTable(SQLModel, table=True, extend_existing=True): - __tablename__ = "report_elements" - - id: str = Field( - primary_key=True, default_factory=lambda: str(uuid.uuid4()) - ) - label: str = Field(nullable=False) - type: str = Field(nullable=False) # "chart" or "markdown" - - # Data source - data_table: str | None = Field(default=None) # "aggregates" - - # Chart configuration - chart_type: str | None = Field( - default=None - ) # "bar", "line", "scatter", "area", "pie" - x_axis_variable: str | None = Field(default=None) - y_axis_variable: str | None = Field(default=None) - group_by: str | None = 
Field(default=None) - color_by: str | None = Field(default=None) - size_by: str | None = Field(default=None) - - # Markdown specific - markdown_content: str | None = Field(default=None) - - # Metadata - report_id: str | None = Field(default=None, foreign_key="reports.id") - user_id: str | None = Field(default=None, foreign_key="users.id") - position: int | None = Field(default=None) - visible: bool | None = Field(default=True) - created_at: datetime = Field(default_factory=datetime.utcnow) - updated_at: datetime = Field(default_factory=datetime.utcnow) - - -report_element_table_link = TableLink( - model_cls=ReportElement, - table_cls=ReportElementTable, -) diff --git a/src/policyengine/database/report_table.py b/src/policyengine/database/report_table.py deleted file mode 100644 index 9ac473b5..00000000 --- a/src/policyengine/database/report_table.py +++ /dev/null @@ -1,24 +0,0 @@ -import uuid -from datetime import datetime - -from sqlmodel import Field, SQLModel - -from policyengine.models.report import Report - -from .link import TableLink - - -class ReportTable(SQLModel, table=True, extend_existing=True): - __tablename__ = "reports" - - id: str = Field( - primary_key=True, default_factory=lambda: str(uuid.uuid4()) - ) - label: str = Field(nullable=False) - created_at: datetime = Field(default_factory=datetime.utcnow) - - -report_table_link = TableLink( - model_cls=Report, - table_cls=ReportTable, -) diff --git a/src/policyengine/database/simulation_table.py b/src/policyengine/database/simulation_table.py deleted file mode 100644 index 483a78be..00000000 --- a/src/policyengine/database/simulation_table.py +++ /dev/null @@ -1,50 +0,0 @@ -from datetime import datetime -from uuid import uuid4 - -from sqlmodel import Field, SQLModel - -from policyengine.models import Simulation -from policyengine.utils.compress import compress_data, decompress_data - -from .link import TableLink - - -class SimulationTable(SQLModel, table=True): - __tablename__ = "simulations" - - id: 
str = Field(default_factory=lambda: str(uuid4()), primary_key=True) - created_at: datetime = Field(default_factory=datetime.now) - updated_at: datetime = Field(default_factory=datetime.now) - - policy_id: str | None = Field( - default=None, foreign_key="policies.id", ondelete="SET NULL" - ) - dynamic_id: str | None = Field( - default=None, foreign_key="dynamics.id", ondelete="SET NULL" - ) - dataset_id: str = Field(foreign_key="datasets.id", ondelete="CASCADE") - model_id: str = Field(foreign_key="models.id", ondelete="CASCADE") - model_version_id: str | None = Field( - default=None, foreign_key="model_versions.id", ondelete="SET NULL" - ) - - result: bytes | None = Field(default=None) - - -simulation_table_link = TableLink( - model_cls=Simulation, - table_cls=SimulationTable, - model_to_table_custom_transforms=dict( - policy_id=lambda s: s.policy.id if s.policy else None, - dynamic_id=lambda s: s.dynamic.id if s.dynamic else None, - dataset_id=lambda s: s.dataset.id, - model_id=lambda s: s.model.id, - model_version_id=lambda s: s.model_version.id - if s.model_version - else None, - result=lambda s: compress_data(s.result) if s.result else None, - ), - table_to_model_custom_transforms=dict( - result=lambda b: decompress_data(b) if b else None, - ), -) diff --git a/src/policyengine/database/user_table.py b/src/policyengine/database/user_table.py deleted file mode 100644 index 8c79f73a..00000000 --- a/src/policyengine/database/user_table.py +++ /dev/null @@ -1,28 +0,0 @@ -import uuid -from datetime import datetime - -from sqlmodel import Field, SQLModel - -from policyengine.models.user import User - -from .link import TableLink - - -class UserTable(SQLModel, table=True, extend_existing=True): - __tablename__ = "users" - - id: str = Field( - primary_key=True, default_factory=lambda: str(uuid.uuid4()) - ) - username: str = Field(nullable=False, unique=True) - first_name: str | None = Field(default=None) - last_name: str | None = Field(default=None) - email: str | None 
= Field(default=None) - created_at: datetime = Field(default_factory=datetime.utcnow) - updated_at: datetime = Field(default_factory=datetime.utcnow) - - -user_table_link = TableLink( - model_cls=User, - table_cls=UserTable, -) diff --git a/src/policyengine/database/versioned_dataset_table.py b/src/policyengine/database/versioned_dataset_table.py deleted file mode 100644 index 52aa207b..00000000 --- a/src/policyengine/database/versioned_dataset_table.py +++ /dev/null @@ -1,28 +0,0 @@ -from uuid import uuid4 - -from sqlmodel import Field, SQLModel - -from policyengine.models import VersionedDataset - -from .link import TableLink - - -class VersionedDatasetTable(SQLModel, table=True): - __tablename__ = "versioned_datasets" - - id: str = Field(default_factory=lambda: str(uuid4()), primary_key=True) - name: str = Field(nullable=False) - description: str = Field(nullable=False) - model_id: str | None = Field( - default=None, foreign_key="models.id", ondelete="SET NULL" - ) - - -versioned_dataset_table_link = TableLink( - model_cls=VersionedDataset, - table_cls=VersionedDatasetTable, - model_to_table_custom_transforms=dict( - model_id=lambda vd: vd.model.id if vd.model else None, - ), - table_to_model_custom_transforms={}, -) diff --git a/src/policyengine/models/__init__.py b/src/policyengine/models/__init__.py deleted file mode 100644 index b92592b9..00000000 --- a/src/policyengine/models/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -from .aggregate import Aggregate as Aggregate -from .aggregate import AggregateType as AggregateType -from .baseline_parameter_value import ( - BaselineParameterValue as BaselineParameterValue, -) -from .baseline_variable import BaselineVariable as BaselineVariable -from .dataset import Dataset as Dataset -from .dynamic import Dynamic as Dynamic -from .model import Model as Model -from .model_version import ModelVersion as ModelVersion -from .parameter import Parameter as Parameter -from .parameter_value import ParameterValue as 
ParameterValue -from .policy import Policy as Policy -from .policyengine_uk import ( - policyengine_uk_latest_version as policyengine_uk_latest_version, -) -from .policyengine_uk import ( - policyengine_uk_model as policyengine_uk_model, -) -from .policyengine_us import ( - policyengine_us_latest_version as policyengine_us_latest_version, -) -from .policyengine_us import ( - policyengine_us_model as policyengine_us_model, -) -from .report import Report as Report -from .report_element import ReportElement as ReportElement -from .simulation import Simulation as Simulation -from .user import User as User -from .versioned_dataset import VersionedDataset as VersionedDataset diff --git a/src/policyengine/models/aggregate.py b/src/policyengine/models/aggregate.py deleted file mode 100644 index b25d9d1a..00000000 --- a/src/policyengine/models/aggregate.py +++ /dev/null @@ -1,92 +0,0 @@ -from enum import Enum -from typing import TYPE_CHECKING, Literal - -import pandas as pd -from microdf import MicroDataFrame -from pydantic import BaseModel - -if TYPE_CHECKING: - from policyengine.models import Simulation - - -class AggregateType(str, Enum): - SUM = "sum" - MEAN = "mean" - COUNT = "count" - - -class Aggregate(BaseModel): - simulation: "Simulation" - entity: str - variable_name: str - year: int | None = None - filter_variable_name: str | None = None - filter_variable_value: str | None = None - filter_variable_leq: float | None = None - filter_variable_geq: float | None = None - aggregate_function: Literal[ - AggregateType.SUM, AggregateType.MEAN, AggregateType.COUNT - ] - - value: float | None = None - - @staticmethod - def run(aggregates: list["Aggregate"]) -> list["Aggregate"]: - # Assumes that all aggregates are from the same simulation - results = [] - - tables = aggregates[0].simulation.result - for table in tables: - tables[table] = pd.DataFrame(tables[table]) - weight_col = f"{table}_weight" - if weight_col in tables[table].columns: - tables[table] = MicroDataFrame( - 
tables[table], weights=weight_col - ) - - for agg in aggregates: - if agg.entity not in tables: - raise ValueError( - f"Entity {agg.entity} not found in simulation results" - ) - table = tables[agg.entity] - - if agg.variable_name not in table.columns: - raise ValueError( - f"Variable {agg.variable_name} not found in entity {agg.entity}" - ) - - df = table - - if agg.year is None: - agg.year = aggregates[0].simulation.dataset.year - - if agg.filter_variable_name is not None: - if agg.filter_variable_name not in df.columns: - raise ValueError( - f"Filter variable {agg.filter_variable_name} not found in entity {agg.entity}" - ) - if agg.filter_variable_value is not None: - df = df[ - df[agg.filter_variable_name] - == agg.filter_variable_value - ] - if agg.filter_variable_leq is not None: - df = df[ - df[agg.filter_variable_name] <= agg.filter_variable_leq - ] - if agg.filter_variable_geq is not None: - df = df[ - df[agg.filter_variable_name] >= agg.filter_variable_geq - ] - - if agg.aggregate_function == AggregateType.SUM: - agg.value = float(df[agg.variable_name].sum()) - elif agg.aggregate_function == AggregateType.MEAN: - agg.value = float(df[agg.variable_name].mean()) - elif agg.aggregate_function == AggregateType.COUNT: - agg.value = float((df[agg.variable_name] > 0).sum()) - - results.append(agg) - - return results diff --git a/src/policyengine/models/baseline_parameter_value.py b/src/policyengine/models/baseline_parameter_value.py deleted file mode 100644 index 65cd4aba..00000000 --- a/src/policyengine/models/baseline_parameter_value.py +++ /dev/null @@ -1,14 +0,0 @@ -from datetime import datetime - -from pydantic import BaseModel - -from .model_version import ModelVersion -from .parameter import Parameter - - -class BaselineParameterValue(BaseModel): - parameter: Parameter - model_version: ModelVersion - value: float | int | str | bool | list | None = None - start_date: datetime - end_date: datetime | None = None diff --git 
a/src/policyengine/models/baseline_variable.py b/src/policyengine/models/baseline_variable.py deleted file mode 100644 index b0e739b1..00000000 --- a/src/policyengine/models/baseline_variable.py +++ /dev/null @@ -1,12 +0,0 @@ -from pydantic import BaseModel - -from .model_version import ModelVersion - - -class BaselineVariable(BaseModel): - id: str - model_version: ModelVersion - entity: str - label: str | None = None - description: str | None = None - data_type: type | None = None diff --git a/src/policyengine/models/dataset.py b/src/policyengine/models/dataset.py deleted file mode 100644 index 59dd626f..00000000 --- a/src/policyengine/models/dataset.py +++ /dev/null @@ -1,18 +0,0 @@ -from typing import Any -from uuid import uuid4 - -from pydantic import BaseModel, Field - -from .model import Model -from .versioned_dataset import VersionedDataset - - -class Dataset(BaseModel): - id: str = Field(default_factory=lambda: str(uuid4())) - name: str - description: str | None = None - version: str | None = None - versioned_dataset: VersionedDataset | None = None - year: int | None = None - data: Any | None = None - model: Model | None = None diff --git a/src/policyengine/models/model.py b/src/policyengine/models/model.py deleted file mode 100644 index e898f489..00000000 --- a/src/policyengine/models/model.py +++ /dev/null @@ -1,124 +0,0 @@ -from collections.abc import Callable -from datetime import datetime -from typing import TYPE_CHECKING - -from pydantic import BaseModel - -if TYPE_CHECKING: - from .baseline_parameter_value import BaselineParameterValue - from .baseline_variable import BaselineVariable - from .parameter import Parameter - - -class Model(BaseModel): - id: str - name: str - description: str | None = None - simulation_function: Callable - - def create_seed_objects(self, model_version): - from policyengine_core.parameters import Parameter as CoreParameter - - from .baseline_parameter_value import BaselineParameterValue - from .baseline_variable import 
BaselineVariable - from .parameter import Parameter - - if self.id == "policyengine_uk": - from policyengine_uk.tax_benefit_system import system - elif self.id == "policyengine_us": - from policyengine_us.system import system - else: - raise ValueError("Unsupported model.") - - parameters = [] - baseline_parameter_values = [] - baseline_variables = [] - seen_parameter_ids = set() - - for parameter in system.parameters.get_descendants(): - # Skip if we've already processed this parameter ID - if parameter.name in seen_parameter_ids: - continue - seen_parameter_ids.add(parameter.name) - param = Parameter( - id=parameter.name, - description=parameter.description, - data_type=None, - model=self, - ) - parameters.append(param) - if isinstance(parameter, CoreParameter): - values = parameter.values_list[::-1] - param.data_type = type(values[-1].value) - for i in range(len(values)): - value_at_instant = values[i] - instant_str = safe_parse_instant_str( - value_at_instant.instant_str - ) - if i + 1 < len(values): - next_instant_str = safe_parse_instant_str( - values[i + 1].instant_str - ) - else: - next_instant_str = None - baseline_param_value = BaselineParameterValue( - parameter=param, - model_version=model_version, - value=value_at_instant.value, - start_date=instant_str, - end_date=next_instant_str, - ) - baseline_parameter_values.append(baseline_param_value) - - for variable in system.variables.values(): - baseline_variable = BaselineVariable( - id=variable.name, - model_version=model_version, - entity=variable.entity.key, - label=variable.label, - description=variable.documentation, - data_type=variable.value_type, - ) - baseline_variables.append(baseline_variable) - - return SeedObjects( - parameters=parameters, - baseline_parameter_values=baseline_parameter_values, - baseline_variables=baseline_variables, - ) - - -def safe_parse_instant_str(instant_str: str) -> datetime: - if instant_str == "0000-01-01": - return datetime(1, 1, 1) - else: - try: - return 
datetime.strptime(instant_str, "%Y-%m-%d") - except ValueError: - # Handle invalid dates like 2021-06-31 - # Try to parse year and month, then use last valid day - parts = instant_str.split("-") - if len(parts) == 3: - year = int(parts[0]) - month = int(parts[1]) - day = int(parts[2]) - - # Find the last valid day of the month - import calendar - - last_day = calendar.monthrange(year, month)[1] - if day > last_day: - print( - f"Warning: Invalid date {instant_str}, using {year}-{month:02d}-{last_day:02d}" - ) - return datetime(year, month, last_day) - - # If we can't parse it at all, print and raise - print(f"Error: Cannot parse date {instant_str}") - raise - - -class SeedObjects(BaseModel): - parameters: list["Parameter"] - baseline_parameter_values: list["BaselineParameterValue"] - baseline_variables: list["BaselineVariable"] diff --git a/src/policyengine/models/model_version.py b/src/policyengine/models/model_version.py deleted file mode 100644 index 18b542f8..00000000 --- a/src/policyengine/models/model_version.py +++ /dev/null @@ -1,14 +0,0 @@ -from datetime import datetime -from uuid import uuid4 - -from pydantic import BaseModel, Field - -from .model import Model - - -class ModelVersion(BaseModel): - id: str = Field(default_factory=lambda: str(uuid4())) - model: Model - version: str - description: str | None = None - created_at: datetime = Field(default_factory=datetime.now) diff --git a/src/policyengine/models/policyengine_uk.py b/src/policyengine/models/policyengine_uk.py deleted file mode 100644 index 22b089a8..00000000 --- a/src/policyengine/models/policyengine_uk.py +++ /dev/null @@ -1,114 +0,0 @@ -import importlib.metadata - -import pandas as pd - -from ..models import Dataset, Dynamic, Model, ModelVersion, Policy - - -def run_policyengine_uk( - dataset: "Dataset", - policy: "Policy | None" = None, - dynamic: "Dynamic | None" = None, -) -> dict[str, "pd.DataFrame"]: - data: dict[str, pd.DataFrame] = dataset.data - - from policyengine_uk import 
Microsimulation - from policyengine_uk.data import UKSingleYearDataset - - pe_input_data = UKSingleYearDataset( - person=data["person"], - benunit=data["benunit"], - household=data["household"], - fiscal_year=dataset.year, - ) - - sim = Microsimulation(dataset=pe_input_data) - sim.default_calculation_period = dataset.year - - def simulation_modifier(sim: Microsimulation): - if policy is not None and len(policy.parameter_values) > 0: - for parameter_value in policy.parameter_values: - sim.tax_benefit_system.parameters.get_child( - parameter_value.parameter.id - ).update( - value=parameter_value.value, - start=parameter_value.start_date.strftime("%Y-%m-%d"), - stop=parameter_value.end_date.strftime("%Y-%m-%d") - if parameter_value.end_date - else None, - ) - - if dynamic is not None and len(dynamic.parameter_values) > 0: - for parameter_value in dynamic.parameter_values: - sim.tax_benefit_system.parameters.get_child( - parameter_value.parameter.id - ).update( - value=parameter_value.value, - start=parameter_value.start_date.strftime("%Y-%m-%d"), - stop=parameter_value.end_date.strftime("%Y-%m-%d") - if parameter_value.end_date - else None, - ) - - if dynamic is not None and dynamic.simulation_modifier is not None: - dynamic.simulation_modifier(sim) - if policy is not None and policy.simulation_modifier is not None: - policy.simulation_modifier(sim) - - simulation_modifier(sim) - - # Skip reforms for now - - output_data = {} - - variable_blacklist = [ # TEMPORARY: we need to fix policyengine-uk to make these only take a long time with non-default parameters set to true. 
- "is_uc_entitled_baseline", - "income_elasticity_lsr", - "child_benefit_opts_out", - "housing_benefit_baseline_entitlement", - "baseline_ctc_entitlement", - "pre_budget_change_household_tax", - "pre_budget_change_household_net_income", - "is_on_cliff", - "marginal_tax_rate_on_capital_gains", - "relative_capital_gains_mtr_change", - "pre_budget_change_ons_equivalised_income_decile", - "substitution_elasticity", - "marginal_tax_rate", - "cliff_evaluated", - "cliff_gap", - "substitution_elasticity_lsr", - "relative_wage_change", - "relative_income_change", - "pre_budget_change_household_benefits", - ] - - for entity in ["person", "benunit", "household"]: - output_data[entity] = pd.DataFrame() - for variable in sim.tax_benefit_system.variables.values(): - correct_entity = variable.entity.key == entity - if variable.name in variable_blacklist: - continue - if variable.definition_period != "year": - continue - if correct_entity: - output_data[entity][variable.name] = sim.calculate( - variable.name - ) - - return output_data - - -policyengine_uk_model = Model( - id="policyengine_uk", - name="PolicyEngine UK", - description="PolicyEngine's open-source tax-benefit microsimulation model.", - simulation_function=run_policyengine_uk, -) - -# Get policyengine-uk version - -policyengine_uk_latest_version = ModelVersion( - model=policyengine_uk_model, - version=importlib.metadata.distribution("policyengine_uk").version, -) diff --git a/src/policyengine/models/policyengine_us.py b/src/policyengine/models/policyengine_us.py deleted file mode 100644 index 8886f0b8..00000000 --- a/src/policyengine/models/policyengine_us.py +++ /dev/null @@ -1,115 +0,0 @@ -import importlib.metadata - -import pandas as pd - -from ..models import Dataset, Dynamic, Model, ModelVersion, Policy - - -def run_policyengine_us( - dataset: "Dataset", - policy: "Policy | None" = None, - dynamic: "Dynamic | None" = None, -) -> dict[str, "pd.DataFrame"]: - data: dict[str, pd.DataFrame] = dataset.data - - 
person_df = pd.DataFrame() - - for table_name, table in data.items(): - if table_name == "person": - for col in table.columns: - person_df[f"{col}__{dataset.year}"] = table[col].values - else: - foreign_key = data["person"][f"person_{table_name}_id"] - primary_key = data[table_name][f"{table_name}_id"] - - projected = table.set_index(primary_key).loc[foreign_key] - - for col in projected.columns: - person_df[f"{col}__{dataset.year}"] = projected[col].values - - from policyengine_us import Microsimulation - - sim = Microsimulation(dataset=person_df) - sim.default_calculation_period = dataset.year - - def simulation_modifier(sim: Microsimulation): - if policy is not None and len(policy.parameter_values) > 0: - for parameter_value in policy.parameter_values: - sim.tax_benefit_system.parameters.get_child( - parameter_value.parameter.id - ).update( - parameter_value.value, - start=parameter_value.start_date.strftime("%Y-%m-%d"), - stop=parameter_value.end_date.strftime("%Y-%m-%d") - if parameter_value.end_date - else None, - ) - - if dynamic is not None and len(dynamic.parameter_values) > 0: - for parameter_value in dynamic.parameter_values: - sim.tax_benefit_system.parameters.get_child( - parameter_value.parameter.id - ).update( - parameter_value.value, - start=parameter_value.start_date.strftime("%Y-%m-%d"), - stop=parameter_value.end_date.strftime("%Y-%m-%d") - if parameter_value.end_date - else None, - ) - - if dynamic is not None and dynamic.simulation_modifier is not None: - dynamic.simulation_modifier(sim) - if policy is not None and policy.simulation_modifier is not None: - policy.simulation_modifier(sim) - - simulation_modifier(sim) - - # Skip reforms for now - - output_data = {} - - variable_whitelist = [ - "household_net_income", - ] - - for variable in variable_whitelist: - sim.calculate(variable) - - for entity in [ - "person", - "marital_unit", - "family", - "tax_unit", - "spm_unit", - "household", - ]: - output_data[entity] = pd.DataFrame() - for variable 
in sim.tax_benefit_system.variables.values(): - correct_entity = variable.entity.key == entity - if str(dataset.year) not in list( - map(str, sim.get_holder(variable.name).get_known_periods()) - ): - continue - if variable.definition_period != "year": - continue - if not correct_entity: - continue - output_data[entity][variable.name] = sim.calculate(variable.name) - - return output_data - - -policyengine_us_model = Model( - id="policyengine_us", - name="PolicyEngine US", - description="PolicyEngine's open-source tax-benefit microsimulation model.", - simulation_function=run_policyengine_us, -) - -# Get policyengine-uk version - - -policyengine_us_latest_version = ModelVersion( - model=policyengine_us_model, - version=importlib.metadata.distribution("policyengine_us").version, -) diff --git a/src/policyengine/models/report.py b/src/policyengine/models/report.py deleted file mode 100644 index 6a6442b3..00000000 --- a/src/policyengine/models/report.py +++ /dev/null @@ -1,10 +0,0 @@ -import uuid -from datetime import datetime - -from pydantic import BaseModel, Field - - -class Report(BaseModel): - id: str = Field(default_factory=lambda: str(uuid.uuid4())) - label: str - created_at: datetime | None = None diff --git a/src/policyengine/models/report_element.py b/src/policyengine/models/report_element.py deleted file mode 100644 index 63055fd7..00000000 --- a/src/policyengine/models/report_element.py +++ /dev/null @@ -1,36 +0,0 @@ -import uuid -from datetime import datetime -from typing import Literal - -from pydantic import BaseModel, Field - - -class ReportElement(BaseModel): - id: str = Field(default_factory=lambda: str(uuid.uuid4())) - label: str - type: Literal["chart", "markdown"] - - # Data source - data_table: Literal["aggregates"] | None = None # Which table to pull from - - # Chart configuration - chart_type: ( - Literal["bar", "line", "scatter", "area", "pie", "histogram"] | None - ) = None - x_axis_variable: str | None = None # Column name from the table - 
y_axis_variable: str | None = None # Column name from the table - group_by: str | None = None # Column to group/split series by - color_by: str | None = None # Column for color mapping - size_by: str | None = None # Column for size mapping (bubble charts) - - # Markdown specific - markdown_content: str | None = None - - # Metadata - report_id: str | None = None - user_id: str | None = None - position: int | None = None - visible: bool | None = True - custom_config: dict | None = None # Additional chart-specific config - created_at: datetime | None = None - updated_at: datetime | None = None diff --git a/src/policyengine/models/user.py b/src/policyengine/models/user.py deleted file mode 100644 index dee924e1..00000000 --- a/src/policyengine/models/user.py +++ /dev/null @@ -1,14 +0,0 @@ -import uuid -from datetime import datetime - -from pydantic import BaseModel, Field - - -class User(BaseModel): - id: str = Field(default_factory=lambda: str(uuid.uuid4())) - username: str - first_name: str | None = None - last_name: str | None = None - email: str | None = None - created_at: datetime | None = None - updated_at: datetime | None = None diff --git a/src/policyengine/models/versioned_dataset.py b/src/policyengine/models/versioned_dataset.py deleted file mode 100644 index 2f5e14f7..00000000 --- a/src/policyengine/models/versioned_dataset.py +++ /dev/null @@ -1,12 +0,0 @@ -from uuid import uuid4 - -from pydantic import BaseModel, Field - -from .model import Model - - -class VersionedDataset(BaseModel): - id: str = Field(default_factory=lambda: str(uuid4())) - name: str - description: str - model: Model | None = None diff --git a/src/policyengine/outputs/__init__.py b/src/policyengine/outputs/__init__.py new file mode 100644 index 00000000..8997578d --- /dev/null +++ b/src/policyengine/outputs/__init__.py @@ -0,0 +1,21 @@ +from policyengine.core import Output, OutputCollection +from policyengine.outputs.aggregate import Aggregate, AggregateType +from 
policyengine.outputs.change_aggregate import ( + ChangeAggregate, + ChangeAggregateType, +) +from policyengine.outputs.decile_impact import ( + DecileImpact, + calculate_decile_impacts, +) + +__all__ = [ + "Output", + "OutputCollection", + "Aggregate", + "AggregateType", + "ChangeAggregate", + "ChangeAggregateType", + "DecileImpact", + "calculate_decile_impacts", +] diff --git a/src/policyengine/outputs/aggregate.py b/src/policyengine/outputs/aggregate.py new file mode 100644 index 00000000..2d41259c --- /dev/null +++ b/src/policyengine/outputs/aggregate.py @@ -0,0 +1,124 @@ +from enum import Enum +from typing import Any + +from policyengine.core import Output, Simulation + + +class AggregateType(str, Enum): + SUM = "sum" + MEAN = "mean" + COUNT = "count" + + +class Aggregate(Output): + simulation: Simulation + variable: str + aggregate_type: AggregateType + entity: str | None = None + + filter_variable: str | None = None + filter_variable_eq: Any | None = None + filter_variable_leq: Any | None = None + filter_variable_geq: Any | None = None + filter_variable_describes_quantiles: bool = False + + # Convenient quantile specification (alternative to describes_quantiles) + quantile: int | None = ( + None # Number of quantiles (e.g., 10 for deciles, 5 for quintiles) + ) + quantile_eq: int | None = None # Exact quantile (e.g., 3 for 3rd decile) + quantile_leq: int | None = ( + None # Maximum quantile (e.g., 5 for bottom 5 deciles) + ) + quantile_geq: int | None = ( + None # Minimum quantile (e.g., 9 for top 2 deciles) + ) + + result: Any | None = None + + def run(self): + # Convert quantile specification to describes_quantiles format + if self.quantile is not None: + self.filter_variable_describes_quantiles = True + if self.quantile_eq is not None: + # For a specific quantile, filter between (quantile-1)/n and quantile/n + self.filter_variable_geq = ( + self.quantile_eq - 1 + ) / self.quantile + self.filter_variable_leq = self.quantile_eq / self.quantile + elif 
self.quantile_leq is not None: + self.filter_variable_leq = self.quantile_leq / self.quantile + elif self.quantile_geq is not None: + self.filter_variable_geq = ( + self.quantile_geq - 1 + ) / self.quantile + + # Get variable object + var_obj = next( + v + for v in self.simulation.tax_benefit_model_version.variables + if v.name == self.variable + ) + + # Get the target entity data + target_entity = self.entity or var_obj.entity + data = getattr(self.simulation.output_dataset.data, target_entity) + + # Map variable to target entity if needed + if var_obj.entity != target_entity: + mapped = self.simulation.output_dataset.data.map_to_entity( + var_obj.entity, target_entity, columns=[self.variable] + ) + series = mapped[self.variable] + else: + series = data[self.variable] + + # Apply filters + if self.filter_variable is not None: + filter_var_obj = next( + v + for v in self.simulation.tax_benefit_model_version.variables + if v.name == self.filter_variable + ) + + if filter_var_obj.entity != target_entity: + filter_mapped = ( + self.simulation.output_dataset.data.map_to_entity( + filter_var_obj.entity, + target_entity, + columns=[self.filter_variable], + ) + ) + filter_series = filter_mapped[self.filter_variable] + else: + filter_series = data[self.filter_variable] + + if self.filter_variable_describes_quantiles: + if self.filter_variable_eq is not None: + threshold = filter_series.quantile(self.filter_variable_eq) + series = series[filter_series <= threshold] + if self.filter_variable_leq is not None: + threshold = filter_series.quantile( + self.filter_variable_leq + ) + series = series[filter_series <= threshold] + if self.filter_variable_geq is not None: + threshold = filter_series.quantile( + self.filter_variable_geq + ) + series = series[filter_series >= threshold] + else: + if self.filter_variable_eq is not None: + series = series[filter_series == self.filter_variable_eq] + if self.filter_variable_leq is not None: + series = series[filter_series <= 
self.filter_variable_leq] + if self.filter_variable_geq is not None: + series = series[filter_series >= self.filter_variable_geq] + + # Aggregate - MicroSeries will automatically apply weights + if self.aggregate_type == AggregateType.SUM: + self.result = series.sum() + elif self.aggregate_type == AggregateType.MEAN: + self.result = series.mean() + elif self.aggregate_type == AggregateType.COUNT: + self.result = series.count() diff --git a/src/policyengine/outputs/change_aggregate.py b/src/policyengine/outputs/change_aggregate.py new file mode 100644 index 00000000..b5bfe2df --- /dev/null +++ b/src/policyengine/outputs/change_aggregate.py @@ -0,0 +1,184 @@ +from enum import Enum +from typing import Any + +from policyengine.core import Output, Simulation + + +class ChangeAggregateType(str, Enum): + COUNT = "count" + SUM = "sum" + MEAN = "mean" + + +class ChangeAggregate(Output): + baseline_simulation: Simulation + reform_simulation: Simulation + variable: str + aggregate_type: ChangeAggregateType + entity: str | None = None + + # Filter by absolute change + change_geq: float | None = None # Change >= value (e.g., gain >= 500) + change_leq: float | None = None # Change <= value (e.g., loss <= -500) + change_eq: float | None = None # Change == value + + # Filter by relative change (as decimal, e.g., 0.05 = 5%) + relative_change_geq: float | None = None # Relative change >= value + relative_change_leq: float | None = None # Relative change <= value + relative_change_eq: float | None = None # Relative change == value + + # Filter by another variable (e.g., only count people with age >= 30) + filter_variable: str | None = None + filter_variable_eq: Any | None = None + filter_variable_leq: Any | None = None + filter_variable_geq: Any | None = None + filter_variable_describes_quantiles: bool = False + + # Convenient quantile specification (alternative to describes_quantiles) + quantile: int | None = ( + None # Number of quantiles (e.g., 10 for deciles, 5 for quintiles) + ) 
+ quantile_eq: int | None = None # Exact quantile (e.g., 3 for 3rd decile) + quantile_leq: int | None = ( + None # Maximum quantile (e.g., 5 for bottom 5 deciles) + ) + quantile_geq: int | None = ( + None # Minimum quantile (e.g., 9 for top 2 deciles) + ) + + result: Any | None = None + + def run(self): + # Convert quantile specification to describes_quantiles format + if self.quantile is not None: + self.filter_variable_describes_quantiles = True + if self.quantile_eq is not None: + # For a specific quantile, filter between (quantile-1)/n and quantile/n + self.filter_variable_geq = ( + self.quantile_eq - 1 + ) / self.quantile + self.filter_variable_leq = self.quantile_eq / self.quantile + elif self.quantile_leq is not None: + self.filter_variable_leq = self.quantile_leq / self.quantile + elif self.quantile_geq is not None: + self.filter_variable_geq = ( + self.quantile_geq - 1 + ) / self.quantile + + # Get variable object + var_obj = next( + v + for v in self.baseline_simulation.tax_benefit_model_version.variables + if v.name == self.variable + ) + + # Get the target entity data + target_entity = self.entity or var_obj.entity + baseline_data = getattr( + self.baseline_simulation.output_dataset.data, target_entity + ) + reform_data = getattr( + self.reform_simulation.output_dataset.data, target_entity + ) + + # Map variable to target entity if needed + if var_obj.entity != target_entity: + baseline_mapped = ( + self.baseline_simulation.output_dataset.data.map_to_entity( + var_obj.entity, target_entity + ) + ) + baseline_series = baseline_mapped[self.variable] + + reform_mapped = ( + self.reform_simulation.output_dataset.data.map_to_entity( + var_obj.entity, target_entity + ) + ) + reform_series = reform_mapped[self.variable] + else: + baseline_series = baseline_data[self.variable] + reform_series = reform_data[self.variable] + + # Calculate change (reform - baseline) + change_series = reform_series - baseline_series + + # Calculate relative change (handling 
division by zero) + # Where baseline is 0, relative change is undefined; we'll mask these out if relative filters are used + import numpy as np + + with np.errstate(divide="ignore", invalid="ignore"): + relative_change_series = change_series / baseline_series + relative_change_series = relative_change_series.replace( + [np.inf, -np.inf], np.nan + ) + + # Start with all rows + mask = baseline_series.notna() + + # Apply absolute change filters + if self.change_eq is not None: + mask &= change_series == self.change_eq + if self.change_leq is not None: + mask &= change_series <= self.change_leq + if self.change_geq is not None: + mask &= change_series >= self.change_geq + + # Apply relative change filters + if self.relative_change_eq is not None: + mask &= relative_change_series == self.relative_change_eq + if self.relative_change_leq is not None: + mask &= relative_change_series <= self.relative_change_leq + if self.relative_change_geq is not None: + mask &= relative_change_series >= self.relative_change_geq + + # Apply filter_variable filters + if self.filter_variable is not None: + filter_var_obj = next( + v + for v in self.baseline_simulation.tax_benefit_model_version.variables + if v.name == self.filter_variable + ) + + if filter_var_obj.entity != target_entity: + filter_mapped = ( + self.baseline_simulation.output_dataset.data.map_to_entity( + filter_var_obj.entity, target_entity + ) + ) + filter_series = filter_mapped[self.filter_variable] + else: + filter_series = baseline_data[self.filter_variable] + + if self.filter_variable_describes_quantiles: + if self.filter_variable_eq is not None: + threshold = filter_series.quantile(self.filter_variable_eq) + mask &= filter_series <= threshold + if self.filter_variable_leq is not None: + threshold = filter_series.quantile( + self.filter_variable_leq + ) + mask &= filter_series <= threshold + if self.filter_variable_geq is not None: + threshold = filter_series.quantile( + self.filter_variable_geq + ) + mask &= 
filter_series >= threshold + else: + if self.filter_variable_eq is not None: + mask &= filter_series == self.filter_variable_eq + if self.filter_variable_leq is not None: + mask &= filter_series <= self.filter_variable_leq + if self.filter_variable_geq is not None: + mask &= filter_series >= self.filter_variable_geq + + # Apply mask to get filtered data + filtered_change = change_series[mask] + + # Aggregate + if self.aggregate_type == ChangeAggregateType.COUNT: + self.result = filtered_change.count() + elif self.aggregate_type == ChangeAggregateType.SUM: + self.result = filtered_change.sum() + elif self.aggregate_type == ChangeAggregateType.MEAN: + self.result = filtered_change.mean() diff --git a/src/policyengine/outputs/decile_impact.py b/src/policyengine/outputs/decile_impact.py new file mode 100644 index 00000000..8fcc8579 --- /dev/null +++ b/src/policyengine/outputs/decile_impact.py @@ -0,0 +1,140 @@ +import pandas as pd +from pydantic import ConfigDict + +from policyengine.core import Output, OutputCollection, Simulation + + +class DecileImpact(Output): + """Single decile's impact from a policy reform - represents one database row.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + baseline_simulation: Simulation + reform_simulation: Simulation + income_variable: str = "equiv_hbai_household_net_income" + entity: str | None = None + decile: int + quantiles: int = 10 + + # Results populated by run() + baseline_mean: float | None = None + reform_mean: float | None = None + absolute_change: float | None = None + relative_change: float | None = None + count_better_off: float | None = None + count_worse_off: float | None = None + count_no_change: float | None = None + + def run(self): + """Calculate impact for this specific decile.""" + # Get variable object to determine entity + var_obj = next( + v + for v in self.baseline_simulation.tax_benefit_model_version.variables + if v.name == self.income_variable + ) + + # Get target entity + target_entity 
= self.entity or var_obj.entity + + # Get data from both simulations + baseline_data = getattr( + self.baseline_simulation.output_dataset.data, target_entity + ) + reform_data = getattr( + self.reform_simulation.output_dataset.data, target_entity + ) + + # Map income variable to target entity if needed + if var_obj.entity != target_entity: + baseline_mapped = ( + self.baseline_simulation.output_dataset.data.map_to_entity( + var_obj.entity, target_entity + ) + ) + baseline_income = baseline_mapped[self.income_variable] + + reform_mapped = ( + self.reform_simulation.output_dataset.data.map_to_entity( + var_obj.entity, target_entity + ) + ) + reform_income = reform_mapped[self.income_variable] + else: + baseline_income = baseline_data[self.income_variable] + reform_income = reform_data[self.income_variable] + + # Calculate deciles based on baseline income + decile_series = ( + pd.qcut( + baseline_income, + self.quantiles, + labels=False, + duplicates="drop", + ) + + 1 + ) + + # Calculate changes + absolute_change = reform_income - baseline_income + relative_change = (absolute_change / baseline_income) * 100 + + # Filter to this decile + mask = decile_series == self.decile + + # Populate results + self.baseline_mean = float(baseline_income[mask].mean()) + self.reform_mean = float(reform_income[mask].mean()) + self.absolute_change = float(absolute_change[mask].mean()) + self.relative_change = float(relative_change[mask].mean()) + self.count_better_off = float((absolute_change[mask] > 0).sum()) + self.count_worse_off = float((absolute_change[mask] < 0).sum()) + self.count_no_change = float((absolute_change[mask] == 0).sum()) + + +def calculate_decile_impacts( + baseline_simulation: Simulation, + reform_simulation: Simulation, + income_variable: str = "equiv_hbai_household_net_income", + entity: str | None = None, + quantiles: int = 10, +) -> OutputCollection[DecileImpact]: + """Calculate decile-by-decile impact of a reform. 
+ + Returns: + OutputCollection containing list of DecileImpact objects and DataFrame + """ + results = [] + for decile in range(1, quantiles + 1): + impact = DecileImpact( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + income_variable=income_variable, + entity=entity, + decile=decile, + quantiles=quantiles, + ) + impact.run() + results.append(impact) + + # Create DataFrame + df = pd.DataFrame( + [ + { + "baseline_simulation_id": r.baseline_simulation.id, + "reform_simulation_id": r.reform_simulation.id, + "income_variable": r.income_variable, + "decile": r.decile, + "baseline_mean": r.baseline_mean, + "reform_mean": r.reform_mean, + "absolute_change": r.absolute_change, + "relative_change": r.relative_change, + "count_better_off": r.count_better_off, + "count_worse_off": r.count_worse_off, + "count_no_change": r.count_no_change, + } + for r in results + ] + ) + + return OutputCollection(outputs=results, dataframe=df) diff --git a/src/policyengine/tax_benefit_models/uk.py b/src/policyengine/tax_benefit_models/uk.py new file mode 100644 index 00000000..a9fb102a --- /dev/null +++ b/src/policyengine/tax_benefit_models/uk.py @@ -0,0 +1,33 @@ +"""PolicyEngine UK tax-benefit model - imports from uk/ module.""" + +from .uk import ( + PolicyEngineUK, + PolicyEngineUKDataset, + PolicyEngineUKLatest, + ProgrammeStatistics, + UKYearData, + create_datasets, + general_policy_reform_analysis, + uk_latest, + uk_model, +) + +__all__ = [ + "UKYearData", + "PolicyEngineUKDataset", + "create_datasets", + "PolicyEngineUK", + "PolicyEngineUKLatest", + "uk_model", + "uk_latest", + "general_policy_reform_analysis", + "ProgrammeStatistics", +] + +# Rebuild models to resolve forward references +from policyengine.core import Dataset + +Dataset.model_rebuild() +UKYearData.model_rebuild() +PolicyEngineUKDataset.model_rebuild() +PolicyEngineUKLatest.model_rebuild() diff --git a/src/policyengine/tax_benefit_models/uk/__init__.py 
b/src/policyengine/tax_benefit_models/uk/__init__.py new file mode 100644 index 00000000..ade6e531 --- /dev/null +++ b/src/policyengine/tax_benefit_models/uk/__init__.py @@ -0,0 +1,26 @@ +"""PolicyEngine UK tax-benefit model.""" + +from .analysis import general_policy_reform_analysis +from .datasets import PolicyEngineUKDataset, UKYearData, create_datasets +from .model import PolicyEngineUK, PolicyEngineUKLatest, uk_latest, uk_model +from .outputs import ProgrammeStatistics + +__all__ = [ + "UKYearData", + "PolicyEngineUKDataset", + "create_datasets", + "PolicyEngineUK", + "PolicyEngineUKLatest", + "uk_model", + "uk_latest", + "general_policy_reform_analysis", + "ProgrammeStatistics", +] + +# Rebuild models to resolve forward references +from policyengine.core import Dataset + +Dataset.model_rebuild() +UKYearData.model_rebuild() +PolicyEngineUKDataset.model_rebuild() +PolicyEngineUKLatest.model_rebuild() diff --git a/src/policyengine/tax_benefit_models/uk/analysis.py b/src/policyengine/tax_benefit_models/uk/analysis.py new file mode 100644 index 00000000..40805bf2 --- /dev/null +++ b/src/policyengine/tax_benefit_models/uk/analysis.py @@ -0,0 +1,97 @@ +"""General utility functions for UK policy reform analysis.""" + +import pandas as pd +from pydantic import BaseModel + +from policyengine.core import OutputCollection, Simulation +from policyengine.outputs.decile_impact import ( + DecileImpact, + calculate_decile_impacts, +) + +from .outputs import ProgrammeStatistics + + +class PolicyReformAnalysis(BaseModel): + """Complete policy reform analysis result.""" + + decile_impacts: OutputCollection[DecileImpact] + programme_statistics: OutputCollection[ProgrammeStatistics] + + +def general_policy_reform_analysis( + baseline_simulation: Simulation, + reform_simulation: Simulation, +) -> PolicyReformAnalysis: + """Perform comprehensive analysis of a policy reform. 
+ + Returns: + PolicyReformAnalysis containing decile impacts and programme statistics + """ + # Decile impact + decile_impacts = calculate_decile_impacts( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + ) + + # Major programmes to analyse + programmes = { + # Tax + "income_tax": {"entity": "person", "is_tax": True}, + "national_insurance": {"entity": "person", "is_tax": True}, + "vat": {"entity": "household", "is_tax": True}, + "council_tax": {"entity": "household", "is_tax": True}, + # Benefits + "universal_credit": {"entity": "person", "is_tax": False}, + "child_benefit": {"entity": "person", "is_tax": False}, + "pension_credit": {"entity": "person", "is_tax": False}, + "income_support": {"entity": "person", "is_tax": False}, + "working_tax_credit": {"entity": "person", "is_tax": False}, + "child_tax_credit": {"entity": "person", "is_tax": False}, + } + + programme_statistics = [] + + for programme_name, programme_info in programmes.items(): + entity = programme_info["entity"] + is_tax = programme_info["is_tax"] + + stats = ProgrammeStatistics( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + programme_name=programme_name, + entity=entity, + is_tax=is_tax, + ) + stats.run() + programme_statistics.append(stats) + + # Create DataFrame + programme_df = pd.DataFrame( + [ + { + "baseline_simulation_id": p.baseline_simulation.id, + "reform_simulation_id": p.reform_simulation.id, + "programme_name": p.programme_name, + "entity": p.entity, + "is_tax": p.is_tax, + "baseline_total": p.baseline_total, + "reform_total": p.reform_total, + "change": p.change, + "baseline_count": p.baseline_count, + "reform_count": p.reform_count, + "winners": p.winners, + "losers": p.losers, + } + for p in programme_statistics + ] + ) + + programme_collection = OutputCollection( + outputs=programme_statistics, dataframe=programme_df + ) + + return PolicyReformAnalysis( + decile_impacts=decile_impacts, + 
programme_statistics=programme_collection, + ) diff --git a/src/policyengine/tax_benefit_models/uk/datasets.py b/src/policyengine/tax_benefit_models/uk/datasets.py new file mode 100644 index 00000000..113d4b57 --- /dev/null +++ b/src/policyengine/tax_benefit_models/uk/datasets.py @@ -0,0 +1,176 @@ +from pathlib import Path + +import pandas as pd +from microdf import MicroDataFrame +from pydantic import BaseModel, ConfigDict + +from policyengine.core import Dataset, map_to_entity + + +class UKYearData(BaseModel): + """Entity-level data for a single year.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + person: MicroDataFrame + benunit: MicroDataFrame + household: MicroDataFrame + + def map_to_entity( + self, source_entity: str, target_entity: str, columns: list[str] = None + ) -> MicroDataFrame: + """Map data from source entity to target entity using join keys. + + Args: + source_entity (str): The source entity name ('person', 'benunit', 'household'). + target_entity (str): The target entity name ('person', 'benunit', 'household'). + columns (list[str], optional): List of column names to map. If None, maps all columns. + + Returns: + MicroDataFrame: The mapped data at the target entity level. + + Raises: + ValueError: If source or target entity is invalid. 
+ """ + entity_data = { + "person": self.person, + "benunit": self.benunit, + "household": self.household, + } + return map_to_entity( + entity_data=entity_data, + source_entity=source_entity, + target_entity=target_entity, + person_entity="person", + columns=columns, + ) + + +class PolicyEngineUKDataset(Dataset): + """UK dataset with multi-year entity-level data.""" + + data: UKYearData | None = None + + def model_post_init(self, __context): + """Called after Pydantic initialization.""" + # Make sure we are synchronised between in-memory and storage, at least on initialisation + if self.data is not None: + self.save() + elif self.filepath and not self.data: + try: + self.load() + except FileNotFoundError: + # File doesn't exist yet, that's OK + pass + + def save(self) -> None: + """Save dataset to HDF5 file.""" + filepath = Path(self.filepath) + if not filepath.parent.exists(): + filepath.parent.mkdir(parents=True, exist_ok=True) + with pd.HDFStore(filepath, mode="w") as store: + store["person"] = pd.DataFrame(self.data.person) + store["benunit"] = pd.DataFrame(self.data.benunit) + store["household"] = pd.DataFrame(self.data.household) + + def load(self) -> None: + """Load dataset from HDF5 file into this instance.""" + filepath = self.filepath + with pd.HDFStore(filepath, mode="r") as store: + self.data = UKYearData( + person=MicroDataFrame( + store["person"], weights="person_weight" + ), + benunit=MicroDataFrame( + store["benunit"], weights="benunit_weight" + ), + household=MicroDataFrame( + store["household"], weights="household_weight" + ), + ) + + def __repr__(self) -> str: + if self.data is None: + return f"" + else: + n_people = len(self.data.person) + n_benunits = len(self.data.benunit) + n_households = len(self.data.household) + return f"" + + +def create_datasets( + datasets: list[str] = [ + "hf://policyengine/policyengine-uk-data/frs_2023_24.h5", + "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5", + ], + years: list[int] = [2026, 2027, 
def create_datasets(
    datasets: tuple[str, ...] = (
        "hf://policyengine/policyengine-uk-data/frs_2023_24.h5",
        "hf://policyengine/policyengine-uk-data/enhanced_frs_2023_24.h5",
    ),
    years: tuple[int, ...] = (2026, 2027, 2028, 2029, 2030),
) -> None:
    """Create PolicyEngineUKDataset files from HuggingFace dataset paths.

    Args:
        datasets: HuggingFace dataset paths to convert (defaults are tuples to
            avoid mutable default arguments).
        years: Fiscal years to extract from each dataset.
    """
    # Imported lazily so the module works without policyengine-uk installed,
    # but hoisted out of the loop (it was previously re-imported per dataset).
    from policyengine_uk import Microsimulation

    for dataset in datasets:
        sim = Microsimulation(dataset=dataset)
        for year in years:
            year_dataset = sim.dataset[year]

            # Convert to pandas DataFrames and add weight columns
            person_df = pd.DataFrame(year_dataset.person)
            benunit_df = pd.DataFrame(year_dataset.benunit)
            household_df = pd.DataFrame(year_dataset.household)

            # Household weights are the primary weights: copy them to people.
            person_df = person_df.merge(
                household_df[["household_id", "household_weight"]],
                left_on="person_household_id",
                right_on="household_id",
                how="left",
            )
            person_df = person_df.rename(
                columns={"household_weight": "person_weight"}
            )
            person_df = person_df.drop(columns=["household_id"])

            # Benunits link to households only via their members, so route the
            # household weight through the person table.
            benunit_household_map = person_df[
                ["person_benunit_id", "person_household_id"]
            ].drop_duplicates()
            benunit_df = benunit_df.merge(
                benunit_household_map,
                left_on="benunit_id",
                right_on="person_benunit_id",
                how="left",
            )
            benunit_df = benunit_df.merge(
                household_df[["household_id", "household_weight"]],
                left_on="person_household_id",
                right_on="household_id",
                how="left",
            )
            benunit_df = benunit_df.rename(
                columns={"household_weight": "benunit_weight"}
            )
            benunit_df = benunit_df.drop(
                columns=[
                    "person_benunit_id",
                    "person_household_id",
                    "household_id",
                ],
                errors="ignore",
            )

            uk_dataset = PolicyEngineUKDataset(
                name=f"{dataset}-year-{year}",
                description=f"UK Dataset for year {year} based on {dataset}",
                filepath=f"./data/{Path(dataset).stem}_year_{year}.h5",
                year=year,
                data=UKYearData(
                    person=MicroDataFrame(person_df, weights="person_weight"),
                    benunit=MicroDataFrame(
                        benunit_df, weights="benunit_weight"
                    ),
                    household=MicroDataFrame(
                        household_df, weights="household_weight"
                    ),
                ),
            )
            uk_dataset.save()
import datetime
from importlib.metadata import version
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd
import requests
from microdf import MicroDataFrame

from policyengine.core import (
    Parameter,
    ParameterValue,
    TaxBenefitModel,
    TaxBenefitModelVersion,
    Variable,
)
from policyengine.utils import parse_safe_date

from .datasets import PolicyEngineUKDataset, UKYearData

if TYPE_CHECKING:
    from policyengine.core.simulation import Simulation


class PolicyEngineUK(TaxBenefitModel):
    """Identifies the PolicyEngine UK tax-benefit model."""

    id: str = "policyengine-uk"
    description: str = "The UK's open-source dynamic tax and benefit microsimulation model maintained by PolicyEngine."


uk_model = PolicyEngineUK()

pkg_version = version("policyengine-uk")

# Get published time from PyPI. NOTE(review): this runs at import time and
# needs network access; the timeout keeps a hung request from blocking import
# indefinitely (previously there was no timeout at all).
response = requests.get(
    "https://pypi.org/pypi/policyengine-uk/json", timeout=10
)
data = response.json()
upload_time = data["releases"][pkg_version][0]["upload_time_iso_8601"]


class PolicyEngineUKLatest(TaxBenefitModelVersion):
    """The currently-installed policyengine-uk release with its variables and parameters."""

    model: TaxBenefitModel = uk_model
    version: str = pkg_version
    created_at: datetime.datetime = datetime.datetime.fromisoformat(
        upload_time
    )

    def __init__(self, **kwargs: dict):
        """Populate variables, parameters and parameter values from the installed package."""
        super().__init__(**kwargs)
        from policyengine_core.enums import Enum
        from policyengine_uk.system import system

        self.id = f"{self.model.id}@{self.version}"

        self.variables = []
        for var_obj in system.variables.values():
            variable = Variable(
                id=self.id + "-" + var_obj.name,
                name=var_obj.name,
                tax_benefit_model_version=self,
                entity=var_obj.entity.key,
                description=var_obj.documentation,
                # Enum-valued variables are recorded with str as their type.
                data_type=var_obj.value_type
                if var_obj.value_type is not Enum
                else str,
            )
            if getattr(var_obj, "possible_values", None) is not None:
                variable.possible_values = [
                    member.name
                    for member in var_obj.possible_values._value2member_map_.values()
                ]
            self.variables.append(variable)

        self.parameters = []
        from policyengine_core.parameters import Parameter as CoreParameter

        for param_node in system.parameters.get_descendants():
            if isinstance(param_node, CoreParameter):
                parameter = Parameter(
                    id=self.id + "-" + param_node.name,
                    name=param_node.name,
                    tax_benefit_model_version=self,
                    description=param_node.description,
                    data_type=type(
                        param_node(2025)
                    ),  # Example year to infer type
                    unit=param_node.metadata.get("unit"),
                )
                self.parameters.append(parameter)

                # Each dated value runs until the next one starts; the final
                # value is open-ended (end_date=None).
                for i, param_at_instant in enumerate(param_node.values_list):
                    next_instant = (
                        param_node.values_list[i + 1]
                        if i + 1 < len(param_node.values_list)
                        else None
                    )
                    parameter_value = ParameterValue(
                        parameter=parameter,
                        start_date=parse_safe_date(
                            param_at_instant.instant_str
                        ),
                        end_date=parse_safe_date(next_instant.instant_str)
                        if next_instant
                        else None,
                        value=param_at_instant.value,
                    )
                    self.parameter_values.append(parameter_value)
    def run(self, simulation: "Simulation") -> "Simulation":
        """Run policyengine-uk over the simulation's dataset and attach the outputs.

        Loads the input dataset from disk, applies any policy and dynamic
        modifiers, computes a per-entity variable set (the caller's selection
        or a default set), and saves the result as ``simulation.output_dataset``.
        """
        from policyengine_uk import Microsimulation
        from policyengine_uk.data import UKSingleYearDataset

        from policyengine.utils.parametric_reforms import (
            simulation_modifier_from_parameter_values,
        )

        assert isinstance(simulation.dataset, PolicyEngineUKDataset)

        dataset = simulation.dataset
        # Reload from storage so the run starts from the persisted tables.
        dataset.load()
        input_data = UKSingleYearDataset(
            person=dataset.data.person,
            benunit=dataset.data.benunit,
            household=dataset.data.household,
            fiscal_year=dataset.year,
        )
        microsim = Microsimulation(dataset=input_data)

        # Policy reform: an explicit callable modifier takes precedence over
        # a parameter-value-based reform.
        if (
            simulation.policy
            and simulation.policy.simulation_modifier is not None
        ):
            simulation.policy.simulation_modifier(microsim)
        elif simulation.policy:
            modifier = simulation_modifier_from_parameter_values(
                simulation.policy.parameter_values
            )
            modifier(microsim)

        # Dynamic reform: same precedence rules as the policy reform.
        if (
            simulation.dynamic
            and simulation.dynamic.simulation_modifier is not None
        ):
            simulation.dynamic.simulation_modifier(microsim)
        elif simulation.dynamic:
            modifier = simulation_modifier_from_parameter_values(
                simulation.dynamic.parameter_values
            )
            modifier(microsim)

        # Allow custom variable selection, or use defaults
        if simulation.variables is not None:
            entity_variables = simulation.variables
        else:
            # Default comprehensive variable set
            entity_variables = {
                "person": [
                    # IDs and weights
                    "person_id",
                    "benunit_id",
                    "household_id",
                    "person_weight",
                    # Demographics
                    "age",
                    "gender",
                    "is_adult",
                    "is_SP_age",
                    "is_child",
                    # Income
                    "employment_income",
                    "self_employment_income",
                    "pension_income",
                    "private_pension_income",
                    "savings_interest_income",
                    "dividend_income",
                    "property_income",
                    "total_income",
                    "earned_income",
                    # Benefits
                    "universal_credit",
                    "child_benefit",
                    "pension_credit",
                    "income_support",
                    "working_tax_credit",
                    "child_tax_credit",
                    # Tax
                    "income_tax",
                    "national_insurance",
                ],
                "benunit": [
                    # IDs and weights
                    "benunit_id",
                    "benunit_weight",
                    # Structure
                    "family_type",
                    # Income and benefits
                    "universal_credit",
                    "child_benefit",
                    "working_tax_credit",
                    "child_tax_credit",
                ],
                "household": [
                    # IDs and weights
                    "household_id",
                    "household_weight",
                    # Income measures
                    "household_net_income",
                    "hbai_household_net_income",
                    "equiv_hbai_household_net_income",
                    "household_market_income",
                    "household_gross_income",
                    # Benefits and tax
                    "household_benefits",
                    "household_tax",
                    "vat",
                    # Housing
                    "rent",
                    "council_tax",
                    "tenure_type",
                ],
            }

        data = {
            "person": pd.DataFrame(),
            "benunit": pd.DataFrame(),
            "household": pd.DataFrame(),
        }

        # Column-by-column calculation; each result is mapped to its entity.
        for entity, variables in entity_variables.items():
            for var in variables:
                data[entity][var] = microsim.calculate(
                    var, period=simulation.dataset.year, map_to=entity
                ).values

        data["person"] = MicroDataFrame(
            data["person"], weights="person_weight"
        )
        data["benunit"] = MicroDataFrame(
            data["benunit"], weights="benunit_weight"
        )
        data["household"] = MicroDataFrame(
            data["household"], weights="household_weight"
        )

        # Output dataset lives next to the input file, named by simulation id.
        simulation.output_dataset = PolicyEngineUKDataset(
            name=dataset.name,
            description=dataset.description,
            filepath=str(
                Path(simulation.dataset.filepath).parent
                / (simulation.id + ".h5")
            ),
            year=simulation.dataset.year,
            is_output_dataset=True,
            data=UKYearData(
                person=data["person"],
                benunit=data["benunit"],
                household=data["household"],
            ),
        )

        simulation.output_dataset.save()


# Module-level singleton for the installed policyengine-uk release.
uk_latest = PolicyEngineUKLatest()
class ProgrammeStatistics(Output):
    """Single programme's statistics from a policy reform - represents one database row."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    baseline_simulation: "Simulation"
    reform_simulation: "Simulation"
    programme_name: str
    entity: str
    is_tax: bool = False

    # Results populated by run()
    baseline_total: float | None = None
    reform_total: float | None = None
    change: float | None = None
    baseline_count: float | None = None
    reform_count: float | None = None
    winners: float | None = None
    losers: float | None = None

    def run(self):
        """Calculate totals, caseloads and winner/loser counts for this programme."""

        def _total(simulation):
            # Weighted sum of the programme variable at the given entity level.
            agg = Aggregate(
                simulation=simulation,
                variable=self.programme_name,
                aggregate_type=AggregateType.SUM,
                entity=self.entity,
            )
            agg.run()
            return agg

        def _count(simulation):
            # Weighted count of units receiving/paying at least 1p.
            agg = Aggregate(
                simulation=simulation,
                variable=self.programme_name,
                aggregate_type=AggregateType.COUNT,
                entity=self.entity,
                filter_variable=self.programme_name,
                filter_variable_geq=0.01,
            )
            agg.run()
            return agg

        baseline_total = _total(self.baseline_simulation)
        reform_total = _total(self.reform_simulation)
        baseline_count = _count(self.baseline_simulation)
        reform_count = _count(self.reform_simulation)

        # Winners and losers. For a benefit, winning means the amount rises;
        # for a tax, winning means the amount falls. Fixed: the previous
        # thresholds for taxes (winners at change >= -0.01, losers at
        # change <= 0.01) counted nearly every unit on both sides; the bounds
        # must flip direction for taxes, not merely flip sign.
        if self.is_tax:
            winner_bound = {"change_leq": -0.01}
            loser_bound = {"change_geq": 0.01}
        else:
            winner_bound = {"change_geq": 0.01}
            loser_bound = {"change_leq": -0.01}

        winners = ChangeAggregate(
            baseline_simulation=self.baseline_simulation,
            reform_simulation=self.reform_simulation,
            variable=self.programme_name,
            aggregate_type=ChangeAggregateType.COUNT,
            entity=self.entity,
            **winner_bound,
        )
        winners.run()

        losers = ChangeAggregate(
            baseline_simulation=self.baseline_simulation,
            reform_simulation=self.reform_simulation,
            variable=self.programme_name,
            aggregate_type=ChangeAggregateType.COUNT,
            entity=self.entity,
            **loser_bound,
        )
        losers.run()

        # Populate results
        self.baseline_total = float(baseline_total.result)
        self.reform_total = float(reform_total.result)
        self.change = float(reform_total.result - baseline_total.result)
        self.baseline_count = float(baseline_count.result)
        self.reform_count = float(reform_count.result)
        self.winners = float(winners.result)
        self.losers = float(losers.result)
self.losers = float(losers.result) diff --git a/src/policyengine/tax_benefit_models/us.py b/src/policyengine/tax_benefit_models/us.py new file mode 100644 index 00000000..c915a3b5 --- /dev/null +++ b/src/policyengine/tax_benefit_models/us.py @@ -0,0 +1,32 @@ +"""PolicyEngine US tax-benefit model - imports from us/ module.""" + +from importlib.util import find_spec + +if find_spec("policyengine_us") is not None: + from .us import ( + PolicyEngineUS, + PolicyEngineUSDataset, + PolicyEngineUSLatest, + ProgramStatistics, + USYearData, + general_policy_reform_analysis, + us_latest, + us_model, + ) + + __all__ = [ + "USYearData", + "PolicyEngineUSDataset", + "PolicyEngineUS", + "PolicyEngineUSLatest", + "us_model", + "us_latest", + "general_policy_reform_analysis", + "ProgramStatistics", + ] + + # Rebuild models to resolve forward references + PolicyEngineUSDataset.model_rebuild() + PolicyEngineUSLatest.model_rebuild() +else: + __all__ = [] diff --git a/src/policyengine/tax_benefit_models/us/__init__.py b/src/policyengine/tax_benefit_models/us/__init__.py new file mode 100644 index 00000000..63361789 --- /dev/null +++ b/src/policyengine/tax_benefit_models/us/__init__.py @@ -0,0 +1,36 @@ +"""PolicyEngine US tax-benefit model.""" + +from importlib.util import find_spec + +if find_spec("policyengine_us") is not None: + from policyengine.core import Dataset + + from .analysis import general_policy_reform_analysis + from .datasets import PolicyEngineUSDataset, USYearData, create_datasets + from .model import ( + PolicyEngineUS, + PolicyEngineUSLatest, + us_latest, + us_model, + ) + from .outputs import ProgramStatistics + + # Rebuild Pydantic models to resolve forward references + Dataset.model_rebuild() + USYearData.model_rebuild() + PolicyEngineUSDataset.model_rebuild() + PolicyEngineUSLatest.model_rebuild() + + __all__ = [ + "USYearData", + "PolicyEngineUSDataset", + "create_datasets", + "PolicyEngineUS", + "PolicyEngineUSLatest", + "us_model", + "us_latest", + 
"general_policy_reform_analysis", + "ProgramStatistics", + ] +else: + __all__ = [] diff --git a/src/policyengine/tax_benefit_models/us/analysis.py b/src/policyengine/tax_benefit_models/us/analysis.py new file mode 100644 index 00000000..c3098d45 --- /dev/null +++ b/src/policyengine/tax_benefit_models/us/analysis.py @@ -0,0 +1,99 @@ +"""General utility functions for US policy reform analysis.""" + +import pandas as pd +from pydantic import BaseModel + +from policyengine.core import OutputCollection, Simulation +from policyengine.outputs.decile_impact import ( + DecileImpact, + calculate_decile_impacts, +) + +from .outputs import ProgramStatistics + + +class PolicyReformAnalysis(BaseModel): + """Complete policy reform analysis result.""" + + decile_impacts: OutputCollection[DecileImpact] + program_statistics: OutputCollection[ProgramStatistics] + + +def general_policy_reform_analysis( + baseline_simulation: Simulation, + reform_simulation: Simulation, +) -> PolicyReformAnalysis: + """Perform comprehensive analysis of a policy reform. 
+ + Returns: + PolicyReformAnalysis containing decile impacts and program statistics + """ + # Decile impact (using household_net_income for US) + decile_impacts = calculate_decile_impacts( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + income_variable="household_net_income", + ) + + # Major programs to analyse + programs = { + # Federal taxes + "income_tax": {"entity": "tax_unit", "is_tax": True}, + "payroll_tax": {"entity": "person", "is_tax": True}, + # State and local taxes + "state_income_tax": {"entity": "tax_unit", "is_tax": True}, + # Benefits + "snap": {"entity": "spm_unit", "is_tax": False}, + "tanf": {"entity": "spm_unit", "is_tax": False}, + "ssi": {"entity": "person", "is_tax": False}, + "social_security": {"entity": "person", "is_tax": False}, + "medicare": {"entity": "person", "is_tax": False}, + "medicaid": {"entity": "person", "is_tax": False}, + "eitc": {"entity": "tax_unit", "is_tax": False}, + "ctc": {"entity": "tax_unit", "is_tax": False}, + } + + program_statistics = [] + + for program_name, program_info in programs.items(): + entity = program_info["entity"] + is_tax = program_info["is_tax"] + + stats = ProgramStatistics( + baseline_simulation=baseline_simulation, + reform_simulation=reform_simulation, + program_name=program_name, + entity=entity, + is_tax=is_tax, + ) + stats.run() + program_statistics.append(stats) + + # Create DataFrame + program_df = pd.DataFrame( + [ + { + "baseline_simulation_id": p.baseline_simulation.id, + "reform_simulation_id": p.reform_simulation.id, + "program_name": p.program_name, + "entity": p.entity, + "is_tax": p.is_tax, + "baseline_total": p.baseline_total, + "reform_total": p.reform_total, + "change": p.change, + "baseline_count": p.baseline_count, + "reform_count": p.reform_count, + "winners": p.winners, + "losers": p.losers, + } + for p in program_statistics + ] + ) + + program_collection = OutputCollection( + outputs=program_statistics, dataframe=program_df + ) + + 
class USYearData(BaseModel):
    """Holds one year of US microdata, one weighted table per entity level."""

    model_config = ConfigDict(arbitrary_types_allowed=True)

    person: MicroDataFrame
    marital_unit: MicroDataFrame
    family: MicroDataFrame
    spm_unit: MicroDataFrame
    tax_unit: MicroDataFrame
    household: MicroDataFrame

    def map_to_entity(
        self, source_entity: str, target_entity: str, columns: list[str] = None
    ) -> MicroDataFrame:
        """Project columns from one entity level onto another.

        Args:
            source_entity (str): Entity the data currently lives at.
            target_entity (str): Entity to map the data onto.
            columns (list[str], optional): Columns to map. If None, maps all columns.

        Returns:
            MicroDataFrame: The mapped data at the target entity level.

        Raises:
            ValueError: If source or target entity is invalid.
        """
        entity_names = (
            "person",
            "marital_unit",
            "family",
            "spm_unit",
            "tax_unit",
            "household",
        )
        # Delegate to the shared core helper; 'person' is the linking entity.
        frames = {name: getattr(self, name) for name in entity_names}
        return map_to_entity(
            entity_data=frames,
            source_entity=source_entity,
            target_entity=target_entity,
            person_entity="person",
            columns=columns,
        )
+ """ + entity_data = { + "person": self.person, + "marital_unit": self.marital_unit, + "family": self.family, + "spm_unit": self.spm_unit, + "tax_unit": self.tax_unit, + "household": self.household, + } + return map_to_entity( + entity_data=entity_data, + source_entity=source_entity, + target_entity=target_entity, + person_entity="person", + columns=columns, + ) + + +class PolicyEngineUSDataset(Dataset): + """US dataset with multi-year entity-level data.""" + + data: USYearData | None = None + + def model_post_init(self, __context) -> None: + """Called after Pydantic initialization.""" + # Make sure we are synchronised between in-memory and storage, at least on initialisation + if self.data is not None: + self.save() + elif self.filepath and not self.data: + try: + self.load() + except FileNotFoundError: + # File doesn't exist yet, that's OK + pass + + def save(self) -> None: + """Save dataset to HDF5 file.""" + filepath = Path(self.filepath) + if not filepath.parent.exists(): + filepath.parent.mkdir(parents=True, exist_ok=True) + with warnings.catch_warnings(): + warnings.filterwarnings( + "ignore", + category=pd.errors.PerformanceWarning, + message=".*PyTables will pickle object types.*", + ) + with pd.HDFStore(filepath, mode="w") as store: + store["person"] = pd.DataFrame(self.data.person) + store["marital_unit"] = pd.DataFrame(self.data.marital_unit) + store["family"] = pd.DataFrame(self.data.family) + store["spm_unit"] = pd.DataFrame(self.data.spm_unit) + store["tax_unit"] = pd.DataFrame(self.data.tax_unit) + store["household"] = pd.DataFrame(self.data.household) + + def load(self) -> None: + """Load dataset from HDF5 file into this instance.""" + filepath = self.filepath + with pd.HDFStore(filepath, mode="r") as store: + self.data = USYearData( + person=MicroDataFrame( + store["person"], weights="person_weight" + ), + marital_unit=MicroDataFrame( + store["marital_unit"], weights="marital_unit_weight" + ), + family=MicroDataFrame( + store["family"], 
weights="family_weight" + ), + spm_unit=MicroDataFrame( + store["spm_unit"], weights="spm_unit_weight" + ), + tax_unit=MicroDataFrame( + store["tax_unit"], weights="tax_unit_weight" + ), + household=MicroDataFrame( + store["household"], weights="household_weight" + ), + ) + + def __repr__(self) -> str: + if self.data is None: + return f"" + else: + n_people = len(self.data.person) + n_marital_units = len(self.data.marital_unit) + n_families = len(self.data.family) + n_spm_units = len(self.data.spm_unit) + n_tax_units = len(self.data.tax_unit) + n_households = len(self.data.household) + return f"" + + +def create_datasets( + datasets: list[str] = [ + "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5", + ], + years: list[int] = [2024, 2025, 2026, 2027, 2028], +) -> None: + """Create PolicyEngineUSDataset instances from HuggingFace dataset paths. + + Args: + datasets: List of HuggingFace dataset paths (e.g., "hf://policyengine/policyengine-us-data/cps_2024.h5") + years: List of years to extract data for + """ + from policyengine_us import Microsimulation + + for dataset in datasets: + sim = Microsimulation(dataset=dataset) + + for year in years: + # Get all input variables from the simulation + # We'll calculate each input variable for the specified year + entity_data = { + "person": {}, + "household": {}, + "marital_unit": {}, + "family": {}, + "spm_unit": {}, + "tax_unit": {}, + } + + # First, get ID columns which are structural (not input variables) + # These define entity membership and relationships + # For person-level links to group entities, use person_X_id naming + id_variables = { + "person": [ + "person_id", + "person_household_id", + "person_marital_unit_id", + "person_family_id", + "person_spm_unit_id", + "person_tax_unit_id", + ], + "household": ["household_id"], + "marital_unit": ["marital_unit_id"], + "family": ["family_id"], + "spm_unit": ["spm_unit_id"], + "tax_unit": ["tax_unit_id"], + } + + for entity_key, var_names in 
def create_datasets(
    datasets: tuple[str, ...] = (
        "hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5",
    ),
    years: tuple[int, ...] = (2024, 2025, 2026, 2027, 2028),
) -> None:
    """Create PolicyEngineUSDataset instances from HuggingFace dataset paths.

    Args:
        datasets: HuggingFace dataset paths (e.g., "hf://policyengine/policyengine-us-data/cps_2024.h5").
            Tuples are used as defaults to avoid mutable default arguments.
        years: Years to extract data for.
    """
    from policyengine_us import Microsimulation

    for dataset in datasets:
        sim = Microsimulation(dataset=dataset)

        for year in years:
            # Get all input variables from the simulation
            # We'll calculate each input variable for the specified year
            entity_data = {
                "person": {},
                "household": {},
                "marital_unit": {},
                "family": {},
                "spm_unit": {},
                "tax_unit": {},
            }

            # First, get ID columns which are structural (not input variables)
            # These define entity membership and relationships
            # For person-level links to group entities, use person_X_id naming
            id_variables = {
                "person": [
                    "person_id",
                    "person_household_id",
                    "person_marital_unit_id",
                    "person_family_id",
                    "person_spm_unit_id",
                    "person_tax_unit_id",
                ],
                "household": ["household_id"],
                "marital_unit": ["marital_unit_id"],
                "family": ["family_id"],
                "spm_unit": ["spm_unit_id"],
                "tax_unit": ["tax_unit_id"],
            }

            for entity_key, var_names in id_variables.items():
                for id_var in var_names:
                    if id_var in sim.tax_benefit_system.variables:
                        values = sim.calculate(id_var, period=year).values
                        entity_data[entity_key][id_var] = values

            # Get input variables and calculate them for this year
            for variable_name in sim.input_variables:
                variable = sim.tax_benefit_system.variables[variable_name]
                entity_key = variable.entity.key
                values = sim.calculate(variable_name, period=year).values
                entity_data[entity_key][variable_name] = values

            # Build entity DataFrames
            person_df = pd.DataFrame(entity_data["person"])
            household_df = pd.DataFrame(entity_data["household"])
            marital_unit_df = pd.DataFrame(entity_data["marital_unit"])
            family_df = pd.DataFrame(entity_data["family"])
            spm_unit_df = pd.DataFrame(entity_data["spm_unit"])
            tax_unit_df = pd.DataFrame(entity_data["tax_unit"])

            # Household weights are primary: map them to all other entities.
            if "household_weight" in household_df.columns:
                # Only add person_weight if it doesn't already exist
                if "person_weight" not in person_df.columns:
                    person_df = person_df.merge(
                        household_df[["household_id", "household_weight"]],
                        left_on="person_household_id",
                        right_on="household_id",
                        how="left",
                    )
                    person_df = person_df.rename(
                        columns={"household_weight": "person_weight"}
                    )
                    person_df = person_df.drop(
                        columns=["household_id"], errors="ignore"
                    )

                # Map household weights to other group entities via person table
                for entity_name, entity_df, person_id_col, entity_id_col in [
                    (
                        "marital_unit",
                        marital_unit_df,
                        "person_marital_unit_id",
                        "marital_unit_id",
                    ),
                    ("family", family_df, "person_family_id", "family_id"),
                    (
                        "spm_unit",
                        spm_unit_df,
                        "person_spm_unit_id",
                        "spm_unit_id",
                    ),
                    (
                        "tax_unit",
                        tax_unit_df,
                        "person_tax_unit_id",
                        "tax_unit_id",
                    ),
                ]:
                    # Only add entity weight if it doesn't already exist
                    if f"{entity_name}_weight" not in entity_df.columns:
                        # Get household_id for each entity from person table
                        entity_household_map = person_df[
                            [person_id_col, "person_household_id"]
                        ].drop_duplicates()
                        entity_df = entity_df.merge(
                            entity_household_map,
                            left_on=entity_id_col,
                            right_on=person_id_col,
                            how="left",
                        )
                        entity_df = entity_df.merge(
                            household_df[["household_id", "household_weight"]],
                            left_on="person_household_id",
                            right_on="household_id",
                            how="left",
                        )
                        entity_df = entity_df.rename(
                            columns={
                                "household_weight": f"{entity_name}_weight"
                            }
                        )
                        entity_df = entity_df.drop(
                            columns=[
                                "household_id",
                                "person_household_id",
                                person_id_col,
                            ],
                            errors="ignore",
                        )

                        # Write the re-weighted frame back to its named
                        # variable: entity_df is a loop-local rebinding, so
                        # the outer DataFrames must be reassigned explicitly.
                        if entity_name == "marital_unit":
                            marital_unit_df = entity_df
                        elif entity_name == "family":
                            family_df = entity_df
                        elif entity_name == "spm_unit":
                            spm_unit_df = entity_df
                        elif entity_name == "tax_unit":
                            tax_unit_df = entity_df

            us_dataset = PolicyEngineUSDataset(
                name=f"{dataset}-year-{year}",
                description=f"US Dataset for year {year} based on {dataset}",
                filepath=f"./data/{Path(dataset).stem}_year_{year}.h5",
                year=year,
                data=USYearData(
                    person=MicroDataFrame(person_df, weights="person_weight"),
                    household=MicroDataFrame(
                        household_df, weights="household_weight"
                    ),
                    marital_unit=MicroDataFrame(
                        marital_unit_df, weights="marital_unit_weight"
                    ),
                    family=MicroDataFrame(family_df, weights="family_weight"),
                    spm_unit=MicroDataFrame(
                        spm_unit_df, weights="spm_unit_weight"
                    ),
                    tax_unit=MicroDataFrame(
                        tax_unit_df, weights="tax_unit_weight"
                    ),
                ),
            )
            us_dataset.save()
import datetime
from importlib.metadata import version
from pathlib import Path
from typing import TYPE_CHECKING

import pandas as pd
import requests
from microdf import MicroDataFrame

from policyengine.core import (
    Parameter,
    ParameterValue,
    TaxBenefitModel,
    TaxBenefitModelVersion,
    Variable,
)
from policyengine.utils import parse_safe_date

from .datasets import PolicyEngineUSDataset, USYearData

if TYPE_CHECKING:
    from policyengine.core.simulation import Simulation


class PolicyEngineUS(TaxBenefitModel):
    """Identifies the PolicyEngine US tax-benefit model."""

    id: str = "policyengine-us"
    description: str = "The US's open-source dynamic tax and benefit microsimulation model maintained by PolicyEngine."


us_model = PolicyEngineUS()


def _get_us_package_metadata():
    """Get PolicyEngine US package version and upload time (lazy-loaded).

    Returns:
        A (version, upload_time) pair: the installed package version and the
        ISO-8601 timestamp at which that release was published to PyPI.
    """
    pkg_version = version("policyengine-us")
    # Get published time from PyPI; the timeout stops a hung request from
    # blocking model construction indefinitely (previously unbounded).
    response = requests.get(
        "https://pypi.org/pypi/policyengine-us/json", timeout=10
    )
    data = response.json()
    upload_time = data["releases"][pkg_version][0]["upload_time_iso_8601"]
    return pkg_version, upload_time
    def __init__(self, **kwargs: dict):
        """Populate variables, parameters and parameter values from the installed package.

        Package version and PyPI upload time are fetched lazily so that merely
        importing the module does not require network access.
        """
        # Lazy-load package metadata if not provided
        if "version" not in kwargs or kwargs.get("version") is None:
            pkg_version, upload_time = _get_us_package_metadata()
            kwargs["version"] = pkg_version
            kwargs["created_at"] = datetime.datetime.fromisoformat(upload_time)

        super().__init__(**kwargs)
        from policyengine_core.enums import Enum
        from policyengine_us.system import system

        self.id = f"{self.model.id}@{self.version}"

        self.variables = []
        for var_obj in system.variables.values():
            variable = Variable(
                id=self.id + "-" + var_obj.name,
                name=var_obj.name,
                tax_benefit_model_version=self,
                entity=var_obj.entity.key,
                description=var_obj.documentation,
                # Enum-valued variables are recorded with str as their type.
                data_type=var_obj.value_type
                if var_obj.value_type is not Enum
                else str,
            )
            if (
                hasattr(var_obj, "possible_values")
                and var_obj.possible_values is not None
            ):
                variable.possible_values = list(
                    map(
                        lambda x: x.name,
                        var_obj.possible_values._value2member_map_.values(),
                    )
                )
            self.variables.append(variable)

        self.parameters = []
        from policyengine_core.parameters import Parameter as CoreParameter

        for param_node in system.parameters.get_descendants():
            if isinstance(param_node, CoreParameter):
                parameter = Parameter(
                    id=self.id + "-" + param_node.name,
                    name=param_node.name,
                    tax_benefit_model_version=self,
                    description=param_node.description,
                    # Example year used only to infer the value's Python type.
                    data_type=type(param_node(2025)),
                    unit=param_node.metadata.get("unit"),
                )
                self.parameters.append(parameter)

                # Each dated value runs until the next one starts; the final
                # value is open-ended (end_date=None).
                for i in range(len(param_node.values_list)):
                    param_at_instant = param_node.values_list[i]
                    if i + 1 < len(param_node.values_list):
                        next_instant = param_node.values_list[i + 1]
                    else:
                        next_instant = None
                    parameter_value = ParameterValue(
                        parameter=parameter,
                        start_date=parse_safe_date(
                            param_at_instant.instant_str
                        ),
                        end_date=parse_safe_date(next_instant.instant_str)
                        if next_instant
                        else None,
                        value=param_at_instant.value,
                    )
                    self.parameter_values.append(parameter_value)
simulation.policy.simulation_modifier is not None + ): + simulation.policy.simulation_modifier(microsim) + elif simulation.policy: + modifier = simulation_modifier_from_parameter_values( + simulation.policy.parameter_values + ) + modifier(microsim) + + # Apply dynamic reforms + if ( + simulation.dynamic + and simulation.dynamic.simulation_modifier is not None + ): + simulation.dynamic.simulation_modifier(microsim) + elif simulation.dynamic: + modifier = simulation_modifier_from_parameter_values( + simulation.dynamic.parameter_values + ) + modifier(microsim) + + # Allow custom variable selection, or use defaults + if simulation.variables is not None: + entity_variables = simulation.variables + else: + # Default comprehensive variable set + entity_variables = { + "person": [ + # IDs and weights + "person_id", + "marital_unit_id", + "family_id", + "spm_unit_id", + "tax_unit_id", + "household_id", + "person_weight", + # Demographics + "age", + # Income + "employment_income", + # Benefits + "ssi", + "social_security", + "medicaid", + "unemployment_compensation", + ], + "marital_unit": [ + "marital_unit_id", + "marital_unit_weight", + ], + "family": [ + "family_id", + "family_weight", + ], + "spm_unit": [ + "spm_unit_id", + "spm_unit_weight", + "snap", + "tanf", + "spm_unit_net_income", + ], + "tax_unit": [ + "tax_unit_id", + "tax_unit_weight", + "income_tax", + "employee_payroll_tax", + "eitc", + "ctc", + ], + "household": [ + "household_id", + "household_weight", + "household_net_income", + "household_benefits", + "household_tax", + "household_market_income", + ], + } + + data = { + "person": pd.DataFrame(), + "marital_unit": pd.DataFrame(), + "family": pd.DataFrame(), + "spm_unit": pd.DataFrame(), + "tax_unit": pd.DataFrame(), + "household": pd.DataFrame(), + } + + # ID columns should be preserved from input dataset, not calculated + id_columns = { + "person_id", + "household_id", + "marital_unit_id", + "family_id", + "spm_unit_id", + "tax_unit_id", + } + 
weight_columns = { + "person_weight", + "household_weight", + "marital_unit_weight", + "family_weight", + "spm_unit_weight", + "tax_unit_weight", + } + + # First, copy ID and weight columns from input dataset + for entity in data.keys(): + input_df = pd.DataFrame(getattr(dataset.data, entity)) + entity_id_col = f"{entity}_id" + entity_weight_col = f"{entity}_weight" + + if entity_id_col in input_df.columns: + data[entity][entity_id_col] = input_df[entity_id_col].values + if entity_weight_col in input_df.columns: + data[entity][entity_weight_col] = input_df[ + entity_weight_col + ].values + + # For person entity, also copy person-level group ID columns + person_input_df = pd.DataFrame(dataset.data.person) + for col in person_input_df.columns: + if col.startswith("person_") and col.endswith("_id"): + # Map person_household_id -> household_id, etc. + target_col = col.replace("person_", "") + if target_col in id_columns: + data["person"][target_col] = person_input_df[col].values + + # Then calculate non-ID, non-weight variables from simulation + for entity, variables in entity_variables.items(): + for var in variables: + if var not in id_columns and var not in weight_columns: + data[entity][var] = microsim.calculate( + var, period=simulation.dataset.year, map_to=entity + ).values + + data["person"] = MicroDataFrame( + data["person"], weights="person_weight" + ) + data["marital_unit"] = MicroDataFrame( + data["marital_unit"], weights="marital_unit_weight" + ) + data["family"] = MicroDataFrame( + data["family"], weights="family_weight" + ) + data["spm_unit"] = MicroDataFrame( + data["spm_unit"], weights="spm_unit_weight" + ) + data["tax_unit"] = MicroDataFrame( + data["tax_unit"], weights="tax_unit_weight" + ) + data["household"] = MicroDataFrame( + data["household"], weights="household_weight" + ) + + simulation.output_dataset = PolicyEngineUSDataset( + name=dataset.name, + description=dataset.description, + filepath=str( + Path(simulation.dataset.filepath).parent + / 
(simulation.id + ".h5") + ), + year=simulation.dataset.year, + is_output_dataset=True, + data=USYearData( + person=data["person"], + marital_unit=data["marital_unit"], + family=data["family"], + spm_unit=data["spm_unit"], + tax_unit=data["tax_unit"], + household=data["household"], + ), + ) + + simulation.output_dataset.save() + + def _build_simulation_from_dataset(self, microsim, dataset, system): + """Build a PolicyEngine Core simulation from dataset entity IDs. + + This follows the same pattern as policyengine-uk, initializing + entities from IDs first, then using set_input() for variables. + + Args: + microsim: The Microsimulation object to populate + dataset: The dataset containing entity data + system: The tax-benefit system + """ + import numpy as np + from policyengine_core.simulations.simulation_builder import ( + SimulationBuilder, + ) + + # Create builder and instantiate entities + builder = SimulationBuilder() + builder.populations = system.instantiate_entities() + + # Extract entity IDs from dataset + person_data = pd.DataFrame(dataset.data.person) + + # Determine column naming convention + # Support both person_X_id (from create_datasets) and X_id (from custom datasets) + household_id_col = ( + "person_household_id" + if "person_household_id" in person_data.columns + else "household_id" + ) + marital_unit_id_col = ( + "person_marital_unit_id" + if "person_marital_unit_id" in person_data.columns + else "marital_unit_id" + ) + family_id_col = ( + "person_family_id" + if "person_family_id" in person_data.columns + else "family_id" + ) + spm_unit_id_col = ( + "person_spm_unit_id" + if "person_spm_unit_id" in person_data.columns + else "spm_unit_id" + ) + tax_unit_id_col = ( + "person_tax_unit_id" + if "person_tax_unit_id" in person_data.columns + else "tax_unit_id" + ) + + # Declare entities + builder.declare_person_entity( + "person", person_data["person_id"].values + ) + builder.declare_entity( + "household", 
np.unique(person_data[household_id_col].values) + ) + builder.declare_entity( + "spm_unit", np.unique(person_data[spm_unit_id_col].values) + ) + builder.declare_entity( + "family", np.unique(person_data[family_id_col].values) + ) + builder.declare_entity( + "tax_unit", np.unique(person_data[tax_unit_id_col].values) + ) + builder.declare_entity( + "marital_unit", np.unique(person_data[marital_unit_id_col].values) + ) + + # Join persons to group entities + builder.join_with_persons( + builder.populations["household"], + person_data[household_id_col].values, + np.array(["member"] * len(person_data)), + ) + builder.join_with_persons( + builder.populations["spm_unit"], + person_data[spm_unit_id_col].values, + np.array(["member"] * len(person_data)), + ) + builder.join_with_persons( + builder.populations["family"], + person_data[family_id_col].values, + np.array(["member"] * len(person_data)), + ) + builder.join_with_persons( + builder.populations["tax_unit"], + person_data[tax_unit_id_col].values, + np.array(["member"] * len(person_data)), + ) + builder.join_with_persons( + builder.populations["marital_unit"], + person_data[marital_unit_id_col].values, + np.array(["member"] * len(person_data)), + ) + + # Build simulation from populations + microsim.build_from_populations(builder.populations) + + # Set input variables for each entity + # Skip ID columns as they're structural and already used in entity building + # Support both naming conventions + id_columns = { + "person_id", + "household_id", + "person_household_id", + "spm_unit_id", + "person_spm_unit_id", + "family_id", + "person_family_id", + "tax_unit_id", + "person_tax_unit_id", + "marital_unit_id", + "person_marital_unit_id", + } + + for entity_name, entity_df in [ + ("person", dataset.data.person), + ("household", dataset.data.household), + ("spm_unit", dataset.data.spm_unit), + ("family", dataset.data.family), + ("tax_unit", dataset.data.tax_unit), + ("marital_unit", dataset.data.marital_unit), + ]: + df = 
pd.DataFrame(entity_df) + for column in df.columns: + # Skip ID columns and check if variable exists in system + if column not in id_columns and column in system.variables: + microsim.set_input(column, dataset.year, df[column].values) + + +us_latest = PolicyEngineUSLatest() diff --git a/src/policyengine/tax_benefit_models/us/outputs.py b/src/policyengine/tax_benefit_models/us/outputs.py new file mode 100644 index 00000000..38e20858 --- /dev/null +++ b/src/policyengine/tax_benefit_models/us/outputs.py @@ -0,0 +1,108 @@ +"""US-specific output templates.""" + +from typing import TYPE_CHECKING + +from pydantic import ConfigDict + +from policyengine.core import Output +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.outputs.change_aggregate import ( + ChangeAggregate, + ChangeAggregateType, +) + +if TYPE_CHECKING: + from policyengine.core.simulation import Simulation + + +class ProgramStatistics(Output): + """Single program's statistics from a policy reform - represents one database row.""" + + model_config = ConfigDict(arbitrary_types_allowed=True) + + baseline_simulation: "Simulation" + reform_simulation: "Simulation" + program_name: str + entity: str + is_tax: bool = False + + # Results populated by run() + baseline_total: float | None = None + reform_total: float | None = None + change: float | None = None + baseline_count: float | None = None + reform_count: float | None = None + winners: float | None = None + losers: float | None = None + + def run(self): + """Calculate statistics for this program.""" + # Baseline totals + baseline_total = Aggregate( + simulation=self.baseline_simulation, + variable=self.program_name, + aggregate_type=AggregateType.SUM, + entity=self.entity, + ) + baseline_total.run() + + # Reform totals + reform_total = Aggregate( + simulation=self.reform_simulation, + variable=self.program_name, + aggregate_type=AggregateType.SUM, + entity=self.entity, + ) + reform_total.run() + + # Count of 
recipients/payers (baseline) + baseline_count = Aggregate( + simulation=self.baseline_simulation, + variable=self.program_name, + aggregate_type=AggregateType.COUNT, + entity=self.entity, + filter_variable=self.program_name, + filter_variable_geq=0.01, + ) + baseline_count.run() + + # Count of recipients/payers (reform) + reform_count = Aggregate( + simulation=self.reform_simulation, + variable=self.program_name, + aggregate_type=AggregateType.COUNT, + entity=self.entity, + filter_variable=self.program_name, + filter_variable_geq=0.01, + ) + reform_count.run() + + # Winners and losers + winners = ChangeAggregate( + baseline_simulation=self.baseline_simulation, + reform_simulation=self.reform_simulation, + variable=self.program_name, + aggregate_type=ChangeAggregateType.COUNT, + entity=self.entity, + change_geq=0.01 if not self.is_tax else -0.01, + ) + winners.run() + + losers = ChangeAggregate( + baseline_simulation=self.baseline_simulation, + reform_simulation=self.reform_simulation, + variable=self.program_name, + aggregate_type=ChangeAggregateType.COUNT, + entity=self.entity, + change_leq=-0.01 if not self.is_tax else 0.01, + ) + losers.run() + + # Populate results + self.baseline_total = float(baseline_total.result) + self.reform_total = float(reform_total.result) + self.change = float(reform_total.result - baseline_total.result) + self.baseline_count = float(baseline_count.result) + self.reform_count = float(reform_count.result) + self.winners = float(winners.result) + self.losers = float(losers.result) diff --git a/src/policyengine/utils/__init__.py b/src/policyengine/utils/__init__.py new file mode 100644 index 00000000..e73de67e --- /dev/null +++ b/src/policyengine/utils/__init__.py @@ -0,0 +1,3 @@ +from .dates import parse_safe_date as parse_safe_date +from .plotting import COLORS as COLORS +from .plotting import format_fig as format_fig diff --git a/src/policyengine/utils/charts.py b/src/policyengine/utils/charts.py deleted file mode 100644 index 
0cee7048..00000000 --- a/src/policyengine/utils/charts.py +++ /dev/null @@ -1,286 +0,0 @@ -"""Chart formatting utilities for PolicyEngine.""" - -import plotly.graph_objects as go -from IPython.display import HTML - -COLOUR_SCHEMES = { - "teal": { - "primary": "#319795", - "secondary": "#38B2AC", - "tertiary": "#4FD1C5", - "light": "#81E6D9", - "lighter": "#B2F5EA", - "lightest": "#E6FFFA", - "dark": "#2C7A7B", - "darker": "#285E61", - "darkest": "#234E52", - }, - "blue": { - "primary": "#0EA5E9", - "secondary": "#0284C7", - "tertiary": "#38BDF8", - "light": "#7DD3FC", - "lighter": "#BAE6FD", - "lightest": "#E0F2FE", - "dark": "#026AA2", - "darker": "#075985", - "darkest": "#0C4A6E", - }, - "gray": { - "primary": "#6B7280", - "secondary": "#9CA3AF", - "tertiary": "#D1D5DB", - "light": "#E2E8F0", - "lighter": "#F2F4F7", - "lightest": "#F9FAFB", - "dark": "#4B5563", - "darker": "#344054", - "darkest": "#101828", - }, -} - -DEFAULT_COLOURS = [ - COLOUR_SCHEMES["teal"]["primary"], - COLOUR_SCHEMES["blue"]["primary"], - COLOUR_SCHEMES["teal"]["secondary"], - COLOUR_SCHEMES["blue"]["secondary"], - COLOUR_SCHEMES["teal"]["tertiary"], - COLOUR_SCHEMES["blue"]["tertiary"], - COLOUR_SCHEMES["gray"]["dark"], - COLOUR_SCHEMES["teal"]["dark"], -] - - -def add_fonts() -> HTML: - """Return HTML to add Google Fonts for Roboto and Roboto Mono.""" - return HTML(""" - - - - """) - - -def format_figure( - fig: go.Figure, - title: str | None = None, - x_title: str | None = None, - y_title: str | None = None, - colour_scheme: str = "teal", - show_grid: bool = True, - show_legend: bool = True, - height: int | None = None, - width: int | None = None, -) -> go.Figure: - """Apply consistent formatting to a Plotly figure. 
- - Args: - fig: The Plotly figure to format - title: Optional title for the chart - x_title: Optional x-axis title - y_title: Optional y-axis title - colour_scheme: Colour scheme name (teal, blue, gray) - show_grid: Whether to show gridlines - show_legend: Whether to show the legend - height: Optional figure height in pixels - width: Optional figure width in pixels - - Returns: - The formatted figure - """ - - colours = COLOUR_SCHEMES.get(colour_scheme, COLOUR_SCHEMES["teal"]) - - # Update traces with colour scheme - for i, trace in enumerate(fig.data): - if hasattr(trace, "marker"): - trace.marker.color = DEFAULT_COLOURS[i % len(DEFAULT_COLOURS)] - if hasattr(trace, "line"): - trace.line.color = DEFAULT_COLOURS[i % len(DEFAULT_COLOURS)] - trace.line.width = 2 - - # Base layout settings - layout_updates = { - "font": { - "family": "Roboto, sans-serif", - "size": 14, - "color": COLOUR_SCHEMES["gray"]["darkest"], - }, - "plot_bgcolor": "white", - "paper_bgcolor": "white", - "showlegend": show_legend, - "hovermode": "x unified", - "hoverlabel": { - "bgcolor": "white", - "font": {"family": "Roboto Mono, monospace", "size": 12}, - "bordercolor": colours["light"], - }, - } - - # Add title if provided - if title: - layout_updates["title"] = { - "text": title, - "font": { - "family": "Roboto, sans-serif", - "size": 20, - "color": COLOUR_SCHEMES["gray"]["darkest"], - "weight": 500, - }, - } - - # Configure axes - axis_config = { - "showgrid": show_grid, - "gridcolor": COLOUR_SCHEMES["gray"]["light"], - "gridwidth": 1, - "zeroline": True, - "zerolinecolor": COLOUR_SCHEMES["gray"]["lighter"], - "zerolinewidth": 1, - "tickfont": { - "family": "Roboto Mono, monospace", - "size": 11, - "color": COLOUR_SCHEMES["gray"]["primary"], - }, - "titlefont": { - "family": "Roboto, sans-serif", - "size": 14, - "color": COLOUR_SCHEMES["gray"]["dark"], - }, - "linecolor": COLOUR_SCHEMES["gray"]["light"], - "linewidth": 1, - "showline": True, - "mirror": False, - } - - 
layout_updates["xaxis"] = axis_config.copy() - layout_updates["yaxis"] = axis_config.copy() - - if x_title: - layout_updates["xaxis"]["title"] = x_title - if y_title: - layout_updates["yaxis"]["title"] = y_title - - layout_updates["showlegend"] = len(fig.data) > 1 and show_legend - - # Set dimensions if provided - if height: - layout_updates["height"] = height - if width: - layout_updates["width"] = width - - fig.update_layout(**layout_updates) - - fig.update_xaxes(title_font_color=COLOUR_SCHEMES["gray"]["primary"]) - fig.update_yaxes(title_font_color=COLOUR_SCHEMES["gray"]["primary"]) - - # Add text annotations to bars in bar charts - if any(isinstance(trace, go.Bar) for trace in fig.data): - for trace in fig.data: - if isinstance(trace, go.Bar): - trace.texttemplate = "%{y:,.0f}" - trace.textposition = "outside" - trace.textfont = { - "family": "Roboto Mono, monospace", - "size": 11, - "color": COLOUR_SCHEMES["gray"]["primary"], - } - - return fig - - -def create_bar_chart( - data: dict[str, list], - x: str, - y: str, - title: str | None = None, - colour_scheme: str = "teal", - **kwargs, -) -> go.Figure: - """Create a formatted bar chart. - - Args: - data: Dictionary with data for the chart - x: Column name for x-axis - y: Column name for y-axis - title: Optional chart title - colour_scheme: Colour scheme to use - **kwargs: Additional arguments for format_figure - - Returns: - Formatted bar chart figure - """ - fig = go.Figure( - data=[ - go.Bar( - x=data[x], - y=data[y], - marker_color=COLOUR_SCHEMES[colour_scheme]["primary"], - marker_line_color=COLOUR_SCHEMES[colour_scheme]["dark"], - marker_line_width=1, - hovertemplate=f"{x}: " - + "%{x}
" - + f"{y}: " - + "%{y:,.0f}", - ) - ] - ) - - return format_figure( - fig, - title=title, - x_title=x, - y_title=y, - colour_scheme=colour_scheme, - **kwargs, - ) - - -def create_line_chart( - data: dict[str, list], - x: str, - y: str | list[str], - title: str | None = None, - colour_scheme: str = "teal", - **kwargs, -) -> go.Figure: - """Create a formatted line chart. - - Args: - data: Dictionary with data for the chart - x: Column name for x-axis - y: Column name(s) for y-axis (can be a list for multiple lines) - title: Optional chart title - colour_scheme: Colour scheme to use - **kwargs: Additional arguments for format_figure - - Returns: - Formatted line chart figure - """ - traces = [] - y_columns = y if isinstance(y, list) else [y] - - for i, y_col in enumerate(y_columns): - traces.append( - go.Scatter( - x=data[x], - y=data[y_col], - mode="lines+markers", - name=y_col, - line=dict( - color=DEFAULT_COLOURS[i % len(DEFAULT_COLOURS)], width=2 - ), - marker=dict(size=6), - hovertemplate=f"{y_col}: " + "%{y:,.0f}", - ) - ) - - fig = go.Figure(data=traces) - - return format_figure( - fig, - title=title, - x_title=x, - y_title=y_columns[0] if len(y_columns) == 1 else None, - colour_scheme=colour_scheme, - **kwargs, - ) diff --git a/src/policyengine/utils/compress.py b/src/policyengine/utils/compress.py deleted file mode 100644 index 19180e2a..00000000 --- a/src/policyengine/utils/compress.py +++ /dev/null @@ -1,20 +0,0 @@ -import pickle -from typing import Any - -import blosc - - -def compress_data(data: Any) -> bytes: - """Compress data using blosc after pickling.""" - pickled_data = pickle.dumps(data) - compressed_data = blosc.compress( - pickled_data, typesize=8, cname="zstd", clevel=9, shuffle=blosc.SHUFFLE - ) - return compressed_data - - -def decompress_data(compressed_data: bytes) -> Any: - """Decompress data using blosc and then unpickle.""" - decompressed_data = blosc.decompress(compressed_data) - data = pickle.loads(decompressed_data) - return data 
diff --git a/src/policyengine/utils/datasets.py b/src/policyengine/utils/datasets.py deleted file mode 100644 index 02090e11..00000000 --- a/src/policyengine/utils/datasets.py +++ /dev/null @@ -1,71 +0,0 @@ -import pandas as pd - -from policyengine.models import Dataset - - -def create_uk_dataset( - dataset: str = "enhanced_frs_2023_24.h5", - year: int = 2029, -): - from policyengine_uk import Microsimulation - - from policyengine.models.policyengine_uk import policyengine_uk_model - - sim = Microsimulation( - dataset="hf://policyengine/policyengine-uk-data/" + dataset - ) - sim.default_calculation_period = year - - tables = { - "person": pd.DataFrame(sim.dataset[year].person), - "benunit": pd.DataFrame(sim.dataset[year].benunit), - "household": pd.DataFrame(sim.dataset[year].household), - } - - return Dataset( - id="uk", - name="UK", - description="A representative dataset for the UK, based on the Family Resources Survey.", - year=year, - model=policyengine_uk_model, - data=tables, - ) - - -def create_us_dataset( - dataset: str = "enhanced_cps_2024.h5", - year: int = 2024, -): - from policyengine_us import Microsimulation - - from policyengine.models.policyengine_us import policyengine_us_model - - sim = Microsimulation( - dataset="hf://policyengine/policyengine-us-data/" + dataset - ) - sim.default_calculation_period = year - - known_variables = sim.input_variables - - tables = { - "person": pd.DataFrame(), - "marital_unit": pd.DataFrame(), - "tax_unit": pd.DataFrame(), - "spm_unit": pd.DataFrame(), - "family": pd.DataFrame(), - "household": pd.DataFrame(), - } - - for variable in known_variables: - entity = sim.tax_benefit_system.variables[variable].entity.key - if variable in sim.tax_benefit_system.variables: - tables[entity][variable] = sim.calculate(variable) - - return Dataset( - id="us", - name="US", - description="A representative dataset for the US, based on the Current Population Survey.", - year=year, - model=policyengine_us_model, - data=tables, - ) 
diff --git a/src/policyengine/utils/dates.py b/src/policyengine/utils/dates.py new file mode 100644 index 00000000..e3c65fab --- /dev/null +++ b/src/policyengine/utils/dates.py @@ -0,0 +1,40 @@ +import calendar +from datetime import datetime + + +def parse_safe_date(date_string: str) -> datetime: + """ + Parse a YYYY-MM-DD date string and ensure the year is at least 1. + Handles invalid day values by capping to the last valid day of the month. + + Args: + date_string: Date string in YYYY-MM-DD format + + Returns: + Safe datetime object with year >= 1 + """ + try: + date_string = date_string.replace("0000-", "0001-") + date_obj = datetime.strptime(date_string, "%Y-%m-%d") + if date_obj.year < 1: + # Replace year 0 or negative years with year 1 + return date_obj.replace(year=1) + return date_obj + except ValueError as e: + # Try to handle invalid day values (e.g., 2021-06-31) + if "day is out of range for month" in str(e): + parts = date_string.split("-") + if len(parts) == 3: + year = int(parts[0]) + month = int(parts[1]) + # Get the last valid day of the month + last_day = calendar.monthrange(year, month)[1] + # Use the last valid day instead + corrected_date = f"{year:04d}-{month:02d}-{last_day:02d}" + date_obj = datetime.strptime(corrected_date, "%Y-%m-%d") + if date_obj.year < 1: + return date_obj.replace(year=1) + return date_obj + raise ValueError( + f"Invalid date format: {date_string}. 
Expected YYYY-MM-DD" + ) diff --git a/src/policyengine/utils/parametric_reforms.py b/src/policyengine/utils/parametric_reforms.py new file mode 100644 index 00000000..7d7a869a --- /dev/null +++ b/src/policyengine/utils/parametric_reforms.py @@ -0,0 +1,39 @@ +from collections.abc import Callable + +from policyengine_core.periods import period + +from policyengine.core import ParameterValue + + +def simulation_modifier_from_parameter_values( + parameter_values: list[ParameterValue], +) -> Callable: + """ + Create a simulation modifier function that applies the given parameter values to a simulation. + + Args: + parameter_values (list[ParameterValue]): List of ParameterValue objects to apply. + + Returns: + Callable: A function that takes a Simulation object and applies the parameter values. + """ + + def modifier(simulation): + for pv in parameter_values: + p = simulation.tax_benefit_system.parameters.get_child( + pv.parameter.name + ) + start_period = period(pv.start_date.strftime("%Y-%m-%d")) + stop_period = ( + period(pv.end_date.strftime("%Y-%m-%d")) + if pv.end_date + else None + ) + p.update( + value=pv.value, + start=start_period, + stop=stop_period, + ) + return simulation + + return modifier diff --git a/src/policyengine/utils/plotting.py b/src/policyengine/utils/plotting.py new file mode 100644 index 00000000..77aed94f --- /dev/null +++ b/src/policyengine/utils/plotting.py @@ -0,0 +1,179 @@ +"""Plotting utilities for PolicyEngine visualisations.""" + + +import plotly.graph_objects as go + +# PolicyEngine brand colours +COLORS = { + "primary": "#319795", # Teal + "primary_light": "#E6FFFA", + "primary_dark": "#1D4044", + "success": "#22C55E", # Green (positive changes) + "warning": "#FEC601", # Yellow (cautions) + "error": "#EF4444", # Red (negative changes) + "info": "#1890FF", # Blue (neutral info) + "gray_light": "#F2F4F7", + "gray": "#667085", + "gray_dark": "#101828", + "blue_secondary": "#026AA2", +} + +# Typography +FONT_FAMILY = ( + "Inter, 
-apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif" +) +FONT_SIZE_LABEL = 12 +FONT_SIZE_DEFAULT = 14 +FONT_SIZE_TITLE = 16 + + +def format_fig( + fig: go.Figure, + title: str | None = None, + xaxis_title: str | None = None, + yaxis_title: str | None = None, + show_legend: bool = True, + height: int | None = None, + width: int | None = None, +) -> go.Figure: + """Apply PolicyEngine visual style to a plotly figure. + + Applies professional, clean styling following PolicyEngine design principles: + - Data-driven clarity prioritising immediate understanding + - Professional brand colours (teal primary, semantic colours) + - Clean typography with Inter font family + - Minimal visual clutter + - Appropriate spacing and margins + + Args: + fig: Plotly figure to format + title: Optional title to set/override + xaxis_title: Optional x-axis title to set/override + yaxis_title: Optional y-axis title to set/override + show_legend: Whether to show the legend (default: True) + height: Optional height in pixels + width: Optional width in pixels + + Returns: + Formatted plotly figure (same object, modified in place) + + Example: + >>> import plotly.graph_objects as go + >>> from policyengine.utils import format_fig + >>> fig = go.Figure(data=go.Scatter(x=[1, 2, 3], y=[4, 5, 6])) + >>> format_fig(fig, title="Example chart", xaxis_title="X", yaxis_title="Y") + """ + # Build layout updates + layout_updates = { + "font": { + "family": FONT_FAMILY, + "size": FONT_SIZE_DEFAULT, + "color": COLORS["gray_dark"], + }, + "plot_bgcolor": "#FAFAFA", + "paper_bgcolor": "white", + "margin": {"l": 100, "r": 60, "t": 100, "b": 80}, + "showlegend": show_legend, + "xaxis": { + "title": { + "font": { + "size": FONT_SIZE_DEFAULT, + "family": FONT_FAMILY, + "color": COLORS["gray_dark"], + }, + "standoff": 20, + }, + "tickfont": { + "size": FONT_SIZE_LABEL, + "family": FONT_FAMILY, + "color": COLORS["gray"], + }, + "showgrid": False, + "showline": True, + "linewidth": 2, + "linecolor": 
COLORS["gray_light"], + "zeroline": False, + "ticks": "outside", + "tickwidth": 1, + "tickcolor": COLORS["gray_light"], + }, + "yaxis": { + "title": { + "font": { + "size": FONT_SIZE_DEFAULT, + "family": FONT_FAMILY, + "color": COLORS["gray_dark"], + }, + "standoff": 20, + }, + "tickfont": { + "size": FONT_SIZE_LABEL, + "family": FONT_FAMILY, + "color": COLORS["gray"], + }, + "showgrid": True, + "gridwidth": 1, + "gridcolor": "#E5E7EB", + "showline": False, + "zeroline": False, + }, + "legend": { + "bgcolor": "white", + "bordercolor": COLORS["gray_light"], + "borderwidth": 1, + "font": {"size": FONT_SIZE_LABEL, "family": FONT_FAMILY}, + "orientation": "v", + "yanchor": "top", + "y": 0.99, + "xanchor": "right", + "x": 0.99, + }, + } + + # Add optional parameters + if title is not None: + layout_updates["title"] = { + "text": title, + "font": { + "size": 18, + "family": FONT_FAMILY, + "color": COLORS["gray_dark"], + "weight": 600, + }, + "x": 0, + "xanchor": "left", + "y": 0.98, + "yanchor": "top", + } + + if xaxis_title is not None: + layout_updates["xaxis"]["title"]["text"] = xaxis_title + + if yaxis_title is not None: + layout_updates["yaxis"]["title"]["text"] = yaxis_title + + if height is not None: + layout_updates["height"] = height + + if width is not None: + layout_updates["width"] = width + + # Apply layout + fig.update_layout(**layout_updates) + + # Update all traces to have cleaner styling + fig.update_traces( + marker=dict(size=8, line=dict(width=0)), + line=dict(width=3), + selector=dict(mode="markers+lines"), + ) + fig.update_traces( + marker=dict(size=8, line=dict(width=0)), + selector=dict(mode="markers"), + ) + fig.update_traces( + line=dict(width=3), + selector=dict(mode="lines"), + ) + + return fig diff --git a/tests/test_aggregate.py b/tests/test_aggregate.py new file mode 100644 index 00000000..5b4e8b27 --- /dev/null +++ b/tests/test_aggregate.py @@ -0,0 +1,492 @@ +import os +import tempfile + +import pandas as pd +from microdf import 
MicroDataFrame + +from policyengine.core import Simulation +from policyengine.outputs.aggregate import Aggregate, AggregateType +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, + UKYearData, + uk_latest, +) + + +def test_aggregate_sum(): + """Test basic sum aggregation.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "employment_income": [50000, 30000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + agg = Aggregate( + simulation=simulation, + variable="employment_income", + aggregate_type=AggregateType.SUM, + ) + agg.run() + + assert agg.result == 140000 + + +def test_aggregate_mean(): + """Test mean aggregation.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "employment_income": [50000, 30000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + 
pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + agg = Aggregate( + simulation=simulation, + variable="employment_income", + aggregate_type=AggregateType.MEAN, + ) + agg.run() + + assert abs(agg.result - 46666.67) < 1 + + +def test_aggregate_count(): + """Test count aggregation.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "employment_income": [50000, 30000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + agg = Aggregate( + simulation=simulation, + variable="employment_income", + aggregate_type=AggregateType.COUNT, + ) + agg.run() + + assert agg.result == 3 + + +def test_aggregate_with_entity_mapping(): + """Test 
aggregation with entity mapping (person var at household level).""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "employment_income": [50000, 30000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + # Aggregate person-level income at household level + agg = Aggregate( + simulation=simulation, + variable="employment_income", + aggregate_type=AggregateType.SUM, + entity="household", + ) + agg.run() + + # Should sum across all people mapped to households + assert agg.result == 140000 + + +def test_aggregate_with_filter(): + """Test aggregation with basic filter.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + "employment_income": [50000, 30000, 60000, 45000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + 
"household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + # Sum income for people age >= 30 + agg = Aggregate( + simulation=simulation, + variable="employment_income", + aggregate_type=AggregateType.SUM, + filter_variable="age", + filter_variable_geq=30, + ) + agg.run() + + # Should only include people aged 30, 40, and 35 + assert agg.result == 50000 + 60000 + 45000 + + +def test_aggregate_with_quantile_filter(): + """Test aggregation with quantile-based filter.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4, 5], + "benunit_id": [1, 1, 2, 2, 3], + "household_id": [1, 1, 2, 2, 3], + "age": [20, 30, 40, 50, 60], + "employment_income": [10000, 20000, 30000, 40000, 50000], + "person_weight": [1.0, 1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2, 3], + "benunit_weight": [1.0, 1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2, 3], + "household_weight": [1.0, 1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + # Sum income 
for bottom 50% (by income) + agg = Aggregate( + simulation=simulation, + variable="employment_income", + aggregate_type=AggregateType.SUM, + filter_variable="employment_income", + filter_variable_leq=0.5, + filter_variable_describes_quantiles=True, + ) + agg.run() + + # Should include people with income <= median (30000) + assert agg.result == 10000 + 20000 + 30000 + + +def test_aggregate_invalid_variable(): + """Test that invalid variable names raise errors during run().""" + import pytest + + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1], + "benunit_id": [1], + "household_id": [1], + "age": [30], + "employment_income": [50000], + "person_weight": [1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1], + "benunit_weight": [1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1], + "household_weight": [1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test.h5") + + dataset = PolicyEngineUKDataset( + name="Test", + description="Test dataset", + filepath=filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=uk_latest, + output_dataset=dataset, + ) + + # Invalid variable name should raise error on run() + agg = Aggregate( + simulation=simulation, + variable="nonexistent_variable", + aggregate_type=AggregateType.SUM, + ) + with pytest.raises(StopIteration): + agg.run() + + # Invalid filter variable name should raise error on run() + agg = Aggregate( + simulation=simulation, + variable="employment_income", + aggregate_type=AggregateType.SUM, + filter_variable="nonexistent_filter", + ) + with pytest.raises(StopIteration): + agg.run() diff --git a/tests/test_change_aggregate.py 
b/tests/test_change_aggregate.py new file mode 100644 index 00000000..ea900db6 --- /dev/null +++ b/tests/test_change_aggregate.py @@ -0,0 +1,854 @@ +import os +import tempfile + +import pandas as pd +from microdf import MicroDataFrame + +from policyengine.core import ( + Simulation, +) +from policyengine.outputs.change_aggregate import ( + ChangeAggregate, + ChangeAggregateType, +) +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, + UKYearData, + uk_latest, +) + + +def test_change_aggregate_count(): + """Test counting people with any change.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + "employment_income": [50000, 30000, 60000, 40000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + baseline_filepath = os.path.join(tmpdir, "baseline.h5") + reform_filepath = os.path.join(tmpdir, "reform.h5") + + baseline_dataset = PolicyEngineUKDataset( + name="Baseline", + description="Baseline dataset", + filepath=baseline_filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Reform: increase everyone's income by 1000 + reform_person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + "employment_income": [51000, 31000, 61000, 41000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + reform_dataset = PolicyEngineUKDataset( + name="Reform", 
+ description="Reform dataset", + filepath=reform_filepath, + year=2024, + data=UKYearData( + person=reform_person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + baseline_sim = Simulation( + dataset=baseline_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=baseline_dataset, + ) + + reform_sim = Simulation( + dataset=reform_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=reform_dataset, + ) + + # Count people with any change (all 4 should have changed) + agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="employment_income", + aggregate_type=ChangeAggregateType.COUNT, + ) + agg.run() + + assert agg.result == 4 + + +def test_change_aggregate_with_absolute_filter(): + """Test filtering by absolute change amount.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + "employment_income": [50000, 30000, 60000, 40000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + baseline_filepath = os.path.join(tmpdir, "baseline.h5") + reform_filepath = os.path.join(tmpdir, "reform.h5") + + baseline_dataset = PolicyEngineUKDataset( + name="Baseline", + description="Baseline dataset", + filepath=baseline_filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Reform: different gains for different people + reform_person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + 
"household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + "employment_income": [ + 52000, + 30500, + 61500, + 40200, + ], # Gains: 2000, 500, 1500, 200 + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + reform_dataset = PolicyEngineUKDataset( + name="Reform", + description="Reform dataset", + filepath=reform_filepath, + year=2024, + data=UKYearData( + person=reform_person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + baseline_sim = Simulation( + dataset=baseline_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=baseline_dataset, + ) + + reform_sim = Simulation( + dataset=reform_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=reform_dataset, + ) + + # Count people who gain at least 1000 + agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="employment_income", + aggregate_type=ChangeAggregateType.COUNT, + change_geq=1000, + ) + agg.run() + + assert agg.result == 2 # People 1 and 3 + + +def test_change_aggregate_with_loss_filter(): + """Test filtering for losses (negative changes).""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + "employment_income": [50000, 30000, 60000, 40000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + baseline_filepath = os.path.join(tmpdir, "baseline.h5") + reform_filepath = os.path.join(tmpdir, "reform.h5") + + baseline_dataset = PolicyEngineUKDataset( + name="Baseline", + description="Baseline 
dataset", + filepath=baseline_filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Reform: some people lose money + reform_person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + "employment_income": [ + 49000, + 29000, + 60500, + 39000, + ], # Changes: -1000, -1000, 500, -1000 + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + reform_dataset = PolicyEngineUKDataset( + name="Reform", + description="Reform dataset", + filepath=reform_filepath, + year=2024, + data=UKYearData( + person=reform_person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + baseline_sim = Simulation( + dataset=baseline_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=baseline_dataset, + ) + + reform_sim = Simulation( + dataset=reform_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=reform_dataset, + ) + + # Count people who lose at least 500 (change <= -500) + agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="employment_income", + aggregate_type=ChangeAggregateType.COUNT, + change_leq=-500, + ) + agg.run() + + assert agg.result == 3 # People 1, 2, and 4 + + +def test_change_aggregate_with_relative_filter(): + """Test filtering by relative (percentage) change.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + "employment_income": [50000, 20000, 60000, 40000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + 
"household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + baseline_filepath = os.path.join(tmpdir, "baseline.h5") + reform_filepath = os.path.join(tmpdir, "reform.h5") + + baseline_dataset = PolicyEngineUKDataset( + name="Baseline", + description="Baseline dataset", + filepath=baseline_filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Reform: different percentage gains + reform_person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 35], + # Gains: 5000 (10%), 2000 (10%), 3000 (5%), 1000 (2.5%) + "employment_income": [55000, 22000, 63000, 41000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + reform_dataset = PolicyEngineUKDataset( + name="Reform", + description="Reform dataset", + filepath=reform_filepath, + year=2024, + data=UKYearData( + person=reform_person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + baseline_sim = Simulation( + dataset=baseline_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=baseline_dataset, + ) + + reform_sim = Simulation( + dataset=reform_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=reform_dataset, + ) + + # Count people who gain at least 8% (0.08 relative change) + agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="employment_income", + aggregate_type=ChangeAggregateType.COUNT, + relative_change_geq=0.08, + ) + agg.run() + + assert agg.result == 2 # People 1 and 2 (both 10%) + + +def test_change_aggregate_sum(): + """Test summing changes.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "employment_income": [50000, 30000, 60000], + 
"person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + baseline_filepath = os.path.join(tmpdir, "baseline.h5") + reform_filepath = os.path.join(tmpdir, "reform.h5") + + baseline_dataset = PolicyEngineUKDataset( + name="Baseline", + description="Baseline dataset", + filepath=baseline_filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Reform: everyone gains 1000 + reform_person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "employment_income": [51000, 31000, 61000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + reform_dataset = PolicyEngineUKDataset( + name="Reform", + description="Reform dataset", + filepath=reform_filepath, + year=2024, + data=UKYearData( + person=reform_person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + baseline_sim = Simulation( + dataset=baseline_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=baseline_dataset, + ) + + reform_sim = Simulation( + dataset=reform_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=reform_dataset, + ) + + # Sum all changes + agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="employment_income", + aggregate_type=ChangeAggregateType.SUM, + ) + agg.run() + + assert agg.result == 3000 + + +def test_change_aggregate_mean(): + """Test mean change.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 
2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "employment_income": [50000, 30000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + baseline_filepath = os.path.join(tmpdir, "baseline.h5") + reform_filepath = os.path.join(tmpdir, "reform.h5") + + baseline_dataset = PolicyEngineUKDataset( + name="Baseline", + description="Baseline dataset", + filepath=baseline_filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Reform: different gains + reform_person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "employment_income": [ + 51000, + 32000, + 63000, + ], # Gains: 1000, 2000, 3000 + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + reform_dataset = PolicyEngineUKDataset( + name="Reform", + description="Reform dataset", + filepath=reform_filepath, + year=2024, + data=UKYearData( + person=reform_person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + baseline_sim = Simulation( + dataset=baseline_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=baseline_dataset, + ) + + reform_sim = Simulation( + dataset=reform_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=reform_dataset, + ) + + # Mean change + agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="employment_income", + aggregate_type=ChangeAggregateType.MEAN, + ) + agg.run() + + assert agg.result == 2000 + + +def 
test_change_aggregate_with_filter_variable(): + """Test filtering by another variable (e.g., only adults).""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 15], # Person 4 is a child + "employment_income": [50000, 30000, 60000, 5000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + baseline_filepath = os.path.join(tmpdir, "baseline.h5") + reform_filepath = os.path.join(tmpdir, "reform.h5") + + baseline_dataset = PolicyEngineUKDataset( + name="Baseline", + description="Baseline dataset", + filepath=baseline_filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Reform: everyone gains 1000 + reform_person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "age": [30, 25, 40, 15], + "employment_income": [51000, 31000, 61000, 6000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + reform_dataset = PolicyEngineUKDataset( + name="Reform", + description="Reform dataset", + filepath=reform_filepath, + year=2024, + data=UKYearData( + person=reform_person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + baseline_sim = Simulation( + dataset=baseline_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=baseline_dataset, + ) + + reform_sim = Simulation( + dataset=reform_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=reform_dataset, + ) + + # Count 
adults (age >= 18) who gain money + agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="employment_income", + aggregate_type=ChangeAggregateType.COUNT, + change_geq=1, + filter_variable="age", + filter_variable_geq=18, + ) + agg.run() + + assert agg.result == 3 # Exclude person 4 (age 15) + + +def test_change_aggregate_combined_filters(): + """Test combining multiple filter types.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4, 5], + "benunit_id": [1, 1, 2, 2, 3], + "household_id": [1, 1, 2, 2, 3], + "age": [30, 25, 40, 35, 45], + "employment_income": [50000, 20000, 60000, 40000, 80000], + "person_weight": [1.0, 1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2, 3], + "benunit_weight": [1.0, 1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2, 3], + "household_weight": [1.0, 1.0, 1.0], + } + ), + weights="household_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + baseline_filepath = os.path.join(tmpdir, "baseline.h5") + reform_filepath = os.path.join(tmpdir, "reform.h5") + + baseline_dataset = PolicyEngineUKDataset( + name="Baseline", + description="Baseline dataset", + filepath=baseline_filepath, + year=2024, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Reform: varying gains + reform_person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4, 5], + "benunit_id": [1, 1, 2, 2, 3], + "household_id": [1, 1, 2, 2, 3], + "age": [30, 25, 40, 35, 45], + # Changes: 10000 (20%), 2000 (10%), 3000 (5%), 800 (2%), 4000 (5%) + "employment_income": [60000, 22000, 63000, 40800, 84000], + "person_weight": [1.0, 1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + reform_dataset = PolicyEngineUKDataset( + name="Reform", + description="Reform 
dataset", + filepath=reform_filepath, + year=2024, + data=UKYearData( + person=reform_person_df, + benunit=benunit_df, + household=household_df, + ), + ) + + baseline_sim = Simulation( + dataset=baseline_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=baseline_dataset, + ) + + reform_sim = Simulation( + dataset=reform_dataset, + tax_benefit_model_version=uk_latest, + output_dataset=reform_dataset, + ) + + # Count people age >= 30 who gain at least 2000 and at least 5% relative gain + agg = ChangeAggregate( + baseline_simulation=baseline_sim, + reform_simulation=reform_sim, + variable="employment_income", + aggregate_type=ChangeAggregateType.COUNT, + change_geq=2000, + relative_change_geq=0.05, + filter_variable="age", + filter_variable_geq=30, + ) + agg.run() + + # Should include: person 1 (10000/20%, age 30), person 3 (3000/5%, age 40), person 5 (4000/5%, age 45) + # Should exclude: person 2 (age 25), person 4 (only 800 gain) + assert agg.result == 3 diff --git a/tests/test_database_models.py b/tests/test_database_models.py deleted file mode 100644 index ed6473c0..00000000 --- a/tests/test_database_models.py +++ /dev/null @@ -1,384 +0,0 @@ -"""Test database model tables for set and get operations.""" - -import sys -from datetime import datetime -from pathlib import Path - -import pytest - -# Add src to path to allow imports -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - - -# Define functions at module level to make them pickleable (not test_ prefix to avoid pytest) -def simulation_function(x): - return x * 2 - - -def policy_modifier_function(sim): - sim.set_parameter("tax_rate", 0.25) - return sim - - -def dynamic_modifier_function(sim): - sim.set_parameter("benefit_amount", 1000) - return sim - - -@pytest.fixture -def fresh_database(): - """Create a fresh database instance for each test.""" - # Import here to avoid circular imports - from policyengine.database import Database - - # Use in-memory SQLite for testing - db = 
Database(url="sqlite:///:memory:") - db.create_tables() - return db - - -def test_model_table_set_and_get(fresh_database): - """Test ModelTable set and get operations.""" - from policyengine.models import Model - - model = Model( - id="test_model_1", - name="Test model", - description="A test model", - simulation_function=simulation_function, - ) - - # Set the model - fresh_database.set(model) - - # Get the model - retrieved_model = fresh_database.get(Model, id="test_model_1") - - assert retrieved_model is not None - assert retrieved_model.id == "test_model_1" - assert retrieved_model.name == "Test model" - assert retrieved_model.description == "A test model" - assert retrieved_model.simulation_function(5) == 10 - - -def test_dataset_table_set_and_get(fresh_database): - """Test DatasetTable set and get operations.""" - from policyengine.models import Dataset - - test_data = {"households": [{"id": 1, "income": 50000}]} - - dataset = Dataset( - id="test_dataset_1", - name="Test dataset", - data=test_data, - ) - - fresh_database.set(dataset) - retrieved = fresh_database.get(Dataset, id="test_dataset_1") - - assert retrieved is not None - assert retrieved.id == "test_dataset_1" - assert retrieved.name == "Test dataset" - assert retrieved.data == test_data - - -def test_versioned_dataset_table_set_and_get(fresh_database): - """Test VersionedDatasetTable set and get operations.""" - from policyengine.models import VersionedDataset - - versioned_dataset = VersionedDataset( - id="test_versioned_1", - name="Test versioned dataset", - description="A test versioned dataset", - ) - - fresh_database.set(versioned_dataset) - retrieved = fresh_database.get(VersionedDataset, id="test_versioned_1") - - assert retrieved is not None - assert retrieved.id == "test_versioned_1" - assert retrieved.name == "Test versioned dataset" - assert retrieved.description == "A test versioned dataset" - - -def test_policy_table_set_and_get(fresh_database): - """Test PolicyTable set and get 
operations.""" - from policyengine.models import Policy - - policy = Policy( - id="test_policy_1", - name="Test policy", - description="A test policy", - simulation_modifier=policy_modifier_function, - created_at=datetime.now(), - ) - - fresh_database.set(policy) - retrieved = fresh_database.get(Policy, id="test_policy_1") - - assert retrieved is not None - assert retrieved.id == "test_policy_1" - assert retrieved.name == "Test policy" - assert retrieved.description == "A test policy" - assert callable(retrieved.simulation_modifier) - - -def test_dynamic_table_set_and_get(fresh_database): - """Test DynamicTable set and get operations.""" - from policyengine.models import Dynamic - - dynamic = Dynamic( - id="test_dynamic_1", - name="Test dynamic", - description="A test dynamic policy", - simulation_modifier=dynamic_modifier_function, - created_at=datetime.now(), - ) - - fresh_database.set(dynamic) - retrieved = fresh_database.get(Dynamic, id="test_dynamic_1") - - assert retrieved is not None - assert retrieved.id == "test_dynamic_1" - assert retrieved.name == "Test dynamic" - assert retrieved.description == "A test dynamic policy" - assert callable(retrieved.simulation_modifier) - - -def test_baseline_parameter_value_table_set_and_get(fresh_database): - """Test BaselineParameterValueTable set and get operations.""" - from policyengine.models import ( - BaselineParameterValue, - Model, - ModelVersion, - Parameter, - ) - - # Create model, parameter and model version first - model = Model( - id="bpv_model", - name="BPV model", - description="Model for baseline parameter values", - simulation_function=simulation_function, - ) - fresh_database.set(model) - - parameter = Parameter( - id="test_param_1", - description="Test parameter", - data_type=float, - model=model, - ) - fresh_database.set(parameter) - - model_version = ModelVersion( - id="test_version_1", - model=model, - version="1.0.0", - ) - fresh_database.set(model_version) - - baseline_param = 
BaselineParameterValue( - parameter=parameter, - model_version=model_version, - value=0.2, - start_date=datetime(2024, 1, 1), - ) - - fresh_database.set(baseline_param) - # Note: BaselineParameterValue doesn't have an id field, so we need to query differently - # For now, we'll skip retrieval test for this model - # TODO: Add proper retrieval test for composite key models - - -def test_multiple_operations_on_same_table(fresh_database): - """Test multiple set and get operations on the same table.""" - from policyengine.models import Model - - # Create multiple model instances - models = [] - for i in range(3): - model = Model( - id=f"model_{i}", - name=f"Model {i}", - description=f"Model number {i}", - simulation_function=simulation_function, - ) - models.append(model) - fresh_database.set(model) - - # Retrieve all models - for i, original in enumerate(models): - retrieved = fresh_database.get(Model, id=f"model_{i}") - assert retrieved is not None - assert retrieved.id == original.id - assert retrieved.name == original.name - assert retrieved.description == original.description - - -def test_get_nonexistent_record(fresh_database): - """Test getting a record that doesn't exist.""" - from policyengine.models import Model - - result = fresh_database.get(Model, id="nonexistent_id") - assert result is None - - -def test_complex_data_compression(fresh_database): - """Test that complex data types are properly compressed and decompressed.""" - from policyengine.models import Dataset - - # Create a dataset with complex nested structure - complex_data = { - "households": [ - { - "id": i, - "income": 30000 + i * 5000, - "members": list(range(i + 1)), - } - for i in range(100) - ], - "metadata": { - "source": "test", - "year": 2024, - "nested": {"deep": {"structure": True}}, - }, - } - - dataset = Dataset( - id="complex_dataset", - name="Complex dataset", - data=complex_data, - ) - - fresh_database.set(dataset) - retrieved = fresh_database.get(Dataset, id="complex_dataset") - - 
assert retrieved is not None - assert retrieved.data == complex_data - assert retrieved.data["households"][50]["income"] == 280000 - assert retrieved.data["metadata"]["nested"]["deep"]["structure"] is True - - -def test_user_table_set_and_get(fresh_database): - """Test UserTable set and get operations.""" - from policyengine.models import User - - user = User( - id="test_user_1", - username="testuser", - email="test@example.com", - first_name="Test", - last_name="User", - created_at=datetime.now(), - ) - - fresh_database.set(user) - retrieved = fresh_database.get(User, id="test_user_1") - - assert retrieved is not None - assert retrieved.id == "test_user_1" - assert retrieved.username == "testuser" - assert retrieved.email == "test@example.com" - assert retrieved.first_name == "Test" - assert retrieved.last_name == "User" - - -def test_report_table_set_and_get(fresh_database): - """Test ReportTable set and get operations.""" - from policyengine.models import Report - - report = Report( - id="test_report_1", - label="Test Report", - created_at=datetime.now(), - ) - - fresh_database.set(report) - retrieved = fresh_database.get(Report, id="test_report_1") - - assert retrieved is not None - assert retrieved.id == "test_report_1" - assert retrieved.label == "Test Report" - - -def test_report_element_table_set_and_get(fresh_database): - """Test ReportElementTable set and get operations.""" - from policyengine.models import ReportElement - - element = ReportElement( - id="test_element_1", - label="Test Element", - type="chart", - chart_type="bar", - ) - - fresh_database.set(element) - retrieved = fresh_database.get(ReportElement, id="test_element_1") - - assert retrieved is not None - assert retrieved.id == "test_element_1" - assert retrieved.label == "Test Element" - assert retrieved.type == "chart" - assert retrieved.chart_type == "bar" - - -def test_aggregate_table_set_and_get(fresh_database): - """Test AggregateTable set and get operations.""" - from 
policyengine.models import ( - Aggregate, - Dataset, - Model, - ModelVersion, - Simulation, - ) - - # Create model first - model = Model( - id="agg_model", - name="Agg model", - description="Model for aggregates", - simulation_function=simulation_function, - ) - fresh_database.set(model) - - # Create model version - model_version = ModelVersion( - id="agg_version_1", - model=model, - version="1.0.0", - ) - fresh_database.set(model_version) - - # Create dataset - test_data = {"households": [{"id": 1, "income": 50000}]} - dataset = Dataset( - id="agg_dataset_1", - name="Agg dataset", - data=test_data, - ) - fresh_database.set(dataset) - - # Create simulation - simulation = Simulation( - id="agg_sim_1", - dataset=dataset, - model=model, - model_version=model_version, - ) - fresh_database.set(simulation) - - aggregate = Aggregate( - simulation=simulation, - entity="household", - variable_name="household_income", - aggregate_function="sum", - year=2024, - filter_variable_name="income_positive", - ) - - fresh_database.set(aggregate) - # Note: Aggregate doesn't have an id field or direct retrieval - # We'll skip retrieval test for now - # TODO: Add proper retrieval test for Aggregate model - assert True # Placeholder assertion diff --git a/tests/test_database_simple.py b/tests/test_database_simple.py deleted file mode 100644 index bb890a14..00000000 --- a/tests/test_database_simple.py +++ /dev/null @@ -1,277 +0,0 @@ -"""Test database model tables for simple set and get operations without complex relationships.""" - -import sys -from datetime import datetime -from pathlib import Path - -import pytest - -# Add src to path to allow imports -sys.path.insert(0, str(Path(__file__).parent.parent / "src")) - - -# Define functions at module level to make them pickleable (not test_ prefix to avoid pytest) -def simulation_function(x): - return x * 2 - - -def policy_modifier_function(sim): - sim.set_parameter("tax_rate", 0.25) - return sim - - -def dynamic_modifier_function(sim): - 
sim.set_parameter("benefit_amount", 1000) - return sim - - -@pytest.fixture -def fresh_database(): - """Create a fresh database instance for each test.""" - # Import here to avoid circular imports - from policyengine.database import Database - - # Use in-memory SQLite for testing - db = Database(url="sqlite:///:memory:") - db.create_tables() - return db - - -def test_model_table_set_and_get(fresh_database): - """Test ModelTable set and get operations.""" - from policyengine.models import Model - - model = Model( - id="test_model_1", - name="Test model", - description="A test model", - simulation_function=simulation_function, - ) - - # Set the model - fresh_database.set(model) - - # Get the model - retrieved_model = fresh_database.get(Model, id="test_model_1") - - assert retrieved_model is not None - assert retrieved_model.id == "test_model_1" - assert retrieved_model.name == "Test model" - assert retrieved_model.description == "A test model" - assert retrieved_model.simulation_function(5) == 10 - - -def test_dataset_table_set_and_get(fresh_database): - """Test DatasetTable set and get operations.""" - from policyengine.models import Dataset - - test_data = {"households": [{"id": 1, "income": 50000}]} - - dataset = Dataset( - id="test_dataset_1", - name="Test dataset", - data=test_data, - ) - - fresh_database.set(dataset) - retrieved = fresh_database.get(Dataset, id="test_dataset_1") - - assert retrieved is not None - assert retrieved.id == "test_dataset_1" - assert retrieved.name == "Test dataset" - assert retrieved.data == test_data - - -def test_versioned_dataset_table_set_and_get(fresh_database): - """Test VersionedDatasetTable set and get operations.""" - from policyengine.models import VersionedDataset - - versioned_dataset = VersionedDataset( - id="test_versioned_1", - name="Test versioned dataset", - description="A test versioned dataset", - ) - - fresh_database.set(versioned_dataset) - retrieved = fresh_database.get(VersionedDataset, id="test_versioned_1") 
- - assert retrieved is not None - assert retrieved.id == "test_versioned_1" - assert retrieved.name == "Test versioned dataset" - assert retrieved.description == "A test versioned dataset" - - -def test_policy_table_set_and_get(fresh_database): - """Test PolicyTable set and get operations.""" - from policyengine.models import Policy - - policy = Policy( - id="test_policy_1", - name="Test policy", - description="A test policy", - simulation_modifier=policy_modifier_function, - created_at=datetime.now(), - ) - - fresh_database.set(policy) - retrieved = fresh_database.get(Policy, id="test_policy_1") - - assert retrieved is not None - assert retrieved.id == "test_policy_1" - assert retrieved.name == "Test policy" - assert retrieved.description == "A test policy" - assert callable(retrieved.simulation_modifier) - - -def test_dynamic_table_set_and_get(fresh_database): - """Test DynamicTable set and get operations.""" - from policyengine.models import Dynamic - - dynamic = Dynamic( - id="test_dynamic_1", - name="Test dynamic", - description="A test dynamic policy", - simulation_modifier=dynamic_modifier_function, - created_at=datetime.now(), - ) - - fresh_database.set(dynamic) - retrieved = fresh_database.get(Dynamic, id="test_dynamic_1") - - assert retrieved is not None - assert retrieved.id == "test_dynamic_1" - assert retrieved.name == "Test dynamic" - assert retrieved.description == "A test dynamic policy" - assert callable(retrieved.simulation_modifier) - - -def test_user_table_set_and_get(fresh_database): - """Test UserTable set and get operations.""" - from policyengine.models import User - - user = User( - id="test_user_1", - username="testuser", - email="test@example.com", - first_name="Test", - last_name="User", - created_at=datetime.now(), - ) - - fresh_database.set(user) - retrieved = fresh_database.get(User, id="test_user_1") - - assert retrieved is not None - assert retrieved.id == "test_user_1" - assert retrieved.username == "testuser" - assert 
retrieved.email == "test@example.com" - assert retrieved.first_name == "Test" - assert retrieved.last_name == "User" - - -def test_report_table_set_and_get(fresh_database): - """Test ReportTable set and get operations.""" - from policyengine.models import Report - - report = Report( - id="test_report_1", - label="Test Report", - created_at=datetime.now(), - ) - - fresh_database.set(report) - retrieved = fresh_database.get(Report, id="test_report_1") - - assert retrieved is not None - assert retrieved.id == "test_report_1" - assert retrieved.label == "Test Report" - - -def test_report_element_table_set_and_get(fresh_database): - """Test ReportElementTable set and get operations.""" - from policyengine.models import ReportElement - - element = ReportElement( - id="test_element_1", - label="Test Element", - type="chart", - chart_type="bar", - ) - - fresh_database.set(element) - retrieved = fresh_database.get(ReportElement, id="test_element_1") - - assert retrieved is not None - assert retrieved.id == "test_element_1" - assert retrieved.label == "Test Element" - assert retrieved.type == "chart" - assert retrieved.chart_type == "bar" - - -def test_multiple_operations_on_same_table(fresh_database): - """Test multiple set and get operations on the same table.""" - from policyengine.models import Model - - # Create multiple model instances - models = [] - for i in range(3): - model = Model( - id=f"model_{i}", - name=f"Model {i}", - description=f"Model number {i}", - simulation_function=simulation_function, - ) - models.append(model) - fresh_database.set(model) - - # Retrieve all models - for i, original in enumerate(models): - retrieved = fresh_database.get(Model, id=f"model_{i}") - assert retrieved is not None - assert retrieved.id == original.id - assert retrieved.name == original.name - assert retrieved.description == original.description - - -def test_get_nonexistent_record(fresh_database): - """Test getting a record that doesn't exist.""" - from policyengine.models 
import Model - - result = fresh_database.get(Model, id="nonexistent_id") - assert result is None - - -def test_complex_data_compression(fresh_database): - """Test that complex data types are properly compressed and decompressed.""" - from policyengine.models import Dataset - - # Create a dataset with complex nested structure - complex_data = { - "households": [ - { - "id": i, - "income": 30000 + i * 5000, - "members": list(range(i + 1)), - } - for i in range(100) - ], - "metadata": { - "source": "test", - "year": 2024, - "nested": {"deep": {"structure": True}}, - }, - } - - dataset = Dataset( - id="complex_dataset", - name="Complex dataset", - data=complex_data, - ) - - fresh_database.set(dataset) - retrieved = fresh_database.get(Dataset, id="complex_dataset") - - assert retrieved is not None - assert retrieved.data == complex_data - assert retrieved.data["households"][50]["income"] == 280000 - assert retrieved.data["metadata"]["nested"]["deep"]["structure"] is True diff --git a/tests/test_entity_mapping.py b/tests/test_entity_mapping.py new file mode 100644 index 00000000..77babd44 --- /dev/null +++ b/tests/test_entity_mapping.py @@ -0,0 +1,420 @@ +import pandas as pd +import pytest +from microdf import MicroDataFrame + +from policyengine.tax_benefit_models.uk import UKYearData + + +def test_map_same_entity(): + """Test mapping from an entity to itself returns the same data.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame({"benunit_id": [1, 2], "benunit_weight": [1.0, 1.0]}), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1, 2], "household_weight": [1.0, 1.0]}), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, household=household_df + 
) + + # Test person -> person + result = data.map_to_entity("person", "person") + assert isinstance(result, MicroDataFrame) + assert len(result) == 3 + assert list(result["person_id"]) == [1, 2, 3] + + # Test benunit -> benunit + result = data.map_to_entity("benunit", "benunit") + assert isinstance(result, MicroDataFrame) + assert len(result) == 2 + assert list(result["benunit_id"]) == [1, 2] + + # Test household -> household + result = data.map_to_entity("household", "household") + assert isinstance(result, MicroDataFrame) + assert len(result) == 2 + assert list(result["household_id"]) == [1, 2] + + +def test_map_person_to_benunit(): + """Test mapping person-level data to benunit level aggregates correctly.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "income": [50000, 30000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame({"benunit_id": [1, 2], "benunit_weight": [1.0, 1.0]}), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1, 2], "household_weight": [1.0, 1.0]}), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ) + + result = data.map_to_entity("person", "benunit", columns=["income"]) + + # Should return a MicroDataFrame + assert isinstance(result, MicroDataFrame) + # Should have rows for each benunit (aggregated) + assert len(result) == 2 + # Should have benunit data with aggregated income + assert "benunit_id" in result.columns + assert "income" in result.columns + + # Income should be aggregated (summed) at benunit level + benunit_incomes = result.set_index("benunit_id")["income"].to_dict() + assert benunit_incomes[1] == 80000 # 50000 + 30000 + assert benunit_incomes[2] == 60000 # 60000 + + +def test_map_person_to_household(): + """Test 
mapping person-level data to household level aggregates correctly.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "income": [50000, 30000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame({"benunit_id": [1, 2], "benunit_weight": [1.0, 1.0]}), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "rent": [1000, 800], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ) + + result = data.map_to_entity("person", "household", columns=["income"]) + + # Should have rows for each household (aggregated) + assert len(result) == 2 + # Should have household data with aggregated income + assert "household_id" in result.columns + assert "income" in result.columns + + # Income should be aggregated (summed) at household level + household_incomes = result.set_index("household_id")["income"].to_dict() + assert household_incomes[1] == 80000 # 50000 + 30000 + assert household_incomes[2] == 60000 # 60000 + + +def test_map_benunit_to_person(): + """Test mapping benunit-level data to person level expands correctly.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "total_benefit": [1000, 500], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1, 2], "household_weight": [1.0, 1.0]}), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, 
household=household_df + ) + + result = data.map_to_entity("benunit", "person", columns=["total_benefit"]) + + # Should have rows for each person + assert len(result) == 3 + # Should have benunit data merged in (expanded/replicated) + assert "benunit_id" in result.columns + assert "person_id" in result.columns + assert "total_benefit" in result.columns + + # Benefit should be replicated to all persons in benunit + person_benefits = result.set_index("person_id")["total_benefit"].to_dict() + assert person_benefits[1] == 1000 # Person 1 in benunit 1 + assert person_benefits[2] == 1000 # Person 2 in benunit 1 + assert person_benefits[3] == 500 # Person 3 in benunit 2 + + +def test_map_benunit_to_household(): + """Test mapping benunit-level data to household level aggregates via person.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 3], + "household_id": [1, 1, 2, 2], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2, 3], + "total_benefit": [1000, 500, 300], + "benunit_weight": [1.0, 1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1, 2], "household_weight": [1.0, 1.0]}), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ) + + result = data.map_to_entity( + "benunit", "household", columns=["total_benefit"] + ) + + # Should have household data (aggregated) + assert len(result) == 2 + assert "household_id" in result.columns + assert "total_benefit" in result.columns + + # Benefits should be aggregated at household level + # Household 1 has benunit 1 (1000) + # Household 2 has benunit 2 (500) and benunit 3 (300) = 800 + household_benefits = result.set_index("household_id")[ + "total_benefit" + ].to_dict() + assert household_benefits[1] == 1000 + assert 
household_benefits[2] == 800 + + +def test_map_household_to_person(): + """Test mapping household-level data to person level.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame({"benunit_id": [1, 2], "benunit_weight": [1.0, 1.0]}), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "rent": [1000, 800], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ) + + result = data.map_to_entity("household", "person") + + # Should have rows for each person + assert len(result) == 3 + # Should have household data merged in + assert "household_id" in result.columns + assert "person_id" in result.columns + assert "rent" in result.columns + + +def test_map_household_to_benunit(): + """Test mapping household-level data to benunit level expands via person.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "benunit_id": [1, 1, 2, 2], + "household_id": [1, 1, 2, 2], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame({"benunit_id": [1, 2], "benunit_weight": [1.0, 1.0]}), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "rent": [1000, 800], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ) + + result = data.map_to_entity("household", "benunit", columns=["rent"]) + + # Should have benunit data (expanded from household via person) + # Since benunit-household is 1:1 in this case, should have 2 rows + assert len(result) 
== 2 + assert "benunit_id" in result.columns + assert "rent" in result.columns + + # Rent should be mapped from household to benunit + benunit_rents = result.set_index("benunit_id")["rent"].to_dict() + assert benunit_rents[1] == 1000 # Benunit 1 in household 1 + assert benunit_rents[2] == 800 # Benunit 2 in household 2 + + +def test_map_with_column_selection(): + """Test mapping with specific column selection.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "benunit_id": [1, 1, 2], + "household_id": [1, 1, 2], + "age": [30, 25, 40], + "income": [50000, 30000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame({"benunit_id": [1, 2], "benunit_weight": [1.0, 1.0]}), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1, 2], "household_weight": [1.0, 1.0]}), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ) + + # Map only age to household (aggregated) + result = data.map_to_entity("person", "household", columns=["age"]) + + assert "age" in result.columns + assert "household_id" in result.columns + # income should not be included + assert "income" not in result.columns + # Should be aggregated to household level + assert len(result) == 2 + + +def test_invalid_entity_names(): + """Test that invalid entity names raise ValueError.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1], + "benunit_id": [1], + "household_id": [1], + "person_weight": [1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame({"benunit_id": [1], "benunit_weight": [1.0]}), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1], "household_weight": [1.0]}), + weights="household_weight", + ) + + data = UKYearData( + person=person_df, benunit=benunit_df, 
household=household_df + ) + + with pytest.raises(ValueError, match="Invalid source entity"): + data.map_to_entity("invalid", "person") + + with pytest.raises(ValueError, match="Invalid target entity"): + data.map_to_entity("person", "invalid") diff --git a/tests/test_uk_dataset.py b/tests/test_uk_dataset.py new file mode 100644 index 00000000..f8c04453 --- /dev/null +++ b/tests/test_uk_dataset.py @@ -0,0 +1,112 @@ +import os +import tempfile + +import pandas as pd +from microdf import MicroDataFrame + +from policyengine.core import Dataset, TaxBenefitModel +from policyengine.tax_benefit_models.uk import ( + PolicyEngineUKDataset, + UKYearData, +) + + +def test_imports(): + """Test that basic imports work.""" + # Verify classes are importable + assert PolicyEngineUKDataset is not None + assert UKYearData is not None + assert Dataset is not None + assert TaxBenefitModel is not None + + +def test_uk_latest_instantiation(): + """Test that uk_latest can be instantiated without errors.""" + from policyengine.tax_benefit_models.uk import uk_latest + + assert uk_latest is not None + assert uk_latest.version is not None + assert uk_latest.model is not None + assert uk_latest.created_at is not None + assert ( + len(uk_latest.variables) > 0 + ) # Should have variables from policyengine-uk + + +def test_save_and_load_single_year(): + """Test saving and loading a dataset with a single year.""" + # Create sample data + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "age": [25, 30, 35], + "income": [30000, 45000, 60000], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + benunit_df = MicroDataFrame( + pd.DataFrame( + { + "benunit_id": [1, 2], + "size": [2, 1], + "total_income": [75000, 60000], + "benunit_weight": [1.0, 1.0], + } + ), + weights="benunit_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1], + "num_people": [3], + "rent": [1200], + "household_weight": [1.0], + } + ), + 
weights="household_weight", + ) + + # Create dataset + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test_dataset.h5") + + dataset = PolicyEngineUKDataset( + name="Test Dataset", + description="A test dataset", + filepath=filepath, + year=2025, + data=UKYearData( + person=person_df, benunit=benunit_df, household=household_df + ), + ) + + # Save to file + dataset.save() + + # Load it back + loaded = PolicyEngineUKDataset( + name="Loaded Dataset", + description="Loaded from file", + filepath=filepath, + year=2025, + ) + loaded.load() + + # Verify data + assert loaded.year == 2025 + # Convert to DataFrame for comparison (MicroDataFrame inherits from DataFrame) + pd.testing.assert_frame_equal( + pd.DataFrame(loaded.data.person), pd.DataFrame(person_df) + ) + pd.testing.assert_frame_equal( + pd.DataFrame(loaded.data.benunit), pd.DataFrame(benunit_df) + ) + pd.testing.assert_frame_equal( + pd.DataFrame(loaded.data.household), pd.DataFrame(household_df) + ) diff --git a/tests/test_us_datasets.py b/tests/test_us_datasets.py new file mode 100644 index 00000000..08011610 --- /dev/null +++ b/tests/test_us_datasets.py @@ -0,0 +1,109 @@ +"""Tests for US dataset creation from HuggingFace paths.""" + +import shutil +from pathlib import Path + +import pandas as pd + +from policyengine.tax_benefit_models.us import ( + PolicyEngineUSDataset, + create_datasets, +) + + +def test_create_datasets_from_enhanced_cps(): + """Test creating datasets from enhanced CPS HuggingFace path.""" + # Clean up data directory if it exists + data_dir = Path("./data") + if data_dir.exists(): + shutil.rmtree(data_dir) + + # Create datasets for a single year to test + datasets = ["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"] + years = [2024] + + create_datasets(datasets=datasets, years=years) + + # Verify the dataset was created + dataset_file = data_dir / "enhanced_cps_2024_year_2024.h5" + assert dataset_file.exists(), f"Dataset file {dataset_file} 
should exist" + + # Load and verify dataset structure + dataset = PolicyEngineUSDataset( + name="test", + description="test", + filepath=str(dataset_file), + year=2024, + ) + dataset.load() + + # Check all entity types exist + assert dataset.data is not None + assert dataset.data.person is not None + assert dataset.data.household is not None + assert dataset.data.marital_unit is not None + assert dataset.data.family is not None + assert dataset.data.spm_unit is not None + assert dataset.data.tax_unit is not None + + # Check person data has required columns + person_df = pd.DataFrame(dataset.data.person) + assert "person_id" in person_df.columns + assert "person_household_id" in person_df.columns + assert "person_weight" in person_df.columns + assert len(person_df) > 0 + + # Check household data + household_df = pd.DataFrame(dataset.data.household) + assert "household_id" in household_df.columns + assert "household_weight" in household_df.columns + assert len(household_df) > 0 + + # Check all group entities have weight columns + for entity_name in [ + "marital_unit", + "family", + "spm_unit", + "tax_unit", + ]: + entity_df = pd.DataFrame(getattr(dataset.data, entity_name)) + assert f"{entity_name}_id" in entity_df.columns + assert f"{entity_name}_weight" in entity_df.columns + assert len(entity_df) > 0 + + # Clean up + shutil.rmtree(data_dir) + + +def test_create_datasets_multiple_years(): + """Test creating datasets for multiple years.""" + # Clean up data directory if it exists + data_dir = Path("./data") + if data_dir.exists(): + shutil.rmtree(data_dir) + + datasets = ["hf://policyengine/policyengine-us-data/enhanced_cps_2024.h5"] + years = [2024, 2025] + + create_datasets(datasets=datasets, years=years) + + # Verify both year datasets were created + for year in years: + dataset_file = data_dir / f"enhanced_cps_2024_year_{year}.h5" + assert dataset_file.exists(), ( + f"Dataset file for year {year} should exist" + ) + + # Load and verify + dataset = 
PolicyEngineUSDataset( + name=f"test-{year}", + description=f"test {year}", + filepath=str(dataset_file), + year=year, + ) + dataset.load() + assert dataset.data is not None + assert len(pd.DataFrame(dataset.data.person)) > 0 + + # Clean up + shutil.rmtree(data_dir) diff --git a/tests/test_us_entity_mapping.py b/tests/test_us_entity_mapping.py new file mode 100644 index 00000000..65fb67fb --- /dev/null +++ b/tests/test_us_entity_mapping.py @@ -0,0 +1,334 @@ +import pandas as pd +import pytest +from microdf import MicroDataFrame + +from policyengine.tax_benefit_models.us import USYearData + + +def test_map_same_entity(): + """Test mapping from an entity to itself returns the same data.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3], + "household_id": [1, 1, 2], + "tax_unit_id": [1, 1, 2], + "age": [30, 25, 40], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1, 2], "household_weight": [1.0, 1.0]}), + weights="household_weight", + ) + + tax_unit_df = MicroDataFrame( + pd.DataFrame({"tax_unit_id": [1, 2], "tax_unit_weight": [1.0, 1.0]}), + weights="tax_unit_weight", + ) + + marital_unit_df = MicroDataFrame( + pd.DataFrame( + {"marital_unit_id": [1, 2], "marital_unit_weight": [1.0, 1.0]} + ), + weights="marital_unit_weight", + ) + + family_df = MicroDataFrame( + pd.DataFrame({"family_id": [1, 2], "family_weight": [1.0, 1.0]}), + weights="family_weight", + ) + + spm_unit_df = MicroDataFrame( + pd.DataFrame({"spm_unit_id": [1, 2], "spm_unit_weight": [1.0, 1.0]}), + weights="spm_unit_weight", + ) + + data = USYearData( + person=person_df, + household=household_df, + tax_unit=tax_unit_df, + marital_unit=marital_unit_df, + family=family_df, + spm_unit=spm_unit_df, + ) + + # Test person -> person + result = data.map_to_entity("person", "person") + assert isinstance(result, MicroDataFrame) + assert len(result) == 3 + assert list(result["person_id"]) 
== [1, 2, 3] + + +def test_map_person_to_household_aggregates(): + """Test mapping person-level data to household level aggregates correctly.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "household_id": [1, 1, 2, 2], + "tax_unit_id": [1, 1, 2, 2], + "income": [50000, 30000, 60000, 40000], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "rent": [1000, 800], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + tax_unit_df = MicroDataFrame( + pd.DataFrame({"tax_unit_id": [1, 2], "tax_unit_weight": [1.0, 1.0]}), + weights="tax_unit_weight", + ) + + marital_unit_df = MicroDataFrame( + pd.DataFrame( + {"marital_unit_id": [1, 2], "marital_unit_weight": [1.0, 1.0]} + ), + weights="marital_unit_weight", + ) + + family_df = MicroDataFrame( + pd.DataFrame({"family_id": [1, 2], "family_weight": [1.0, 1.0]}), + weights="family_weight", + ) + + spm_unit_df = MicroDataFrame( + pd.DataFrame({"spm_unit_id": [1, 2], "spm_unit_weight": [1.0, 1.0]}), + weights="spm_unit_weight", + ) + + data = USYearData( + person=person_df, + household=household_df, + tax_unit=tax_unit_df, + marital_unit=marital_unit_df, + family=family_df, + spm_unit=spm_unit_df, + ) + + result = data.map_to_entity("person", "household", columns=["income"]) + + # Should return household-level data + assert isinstance(result, MicroDataFrame) + assert len(result) == 2 + + # Income should be aggregated (summed) at household level + assert "income" in result.columns + household_incomes = result.set_index("household_id")["income"].to_dict() + assert household_incomes[1] == 80000 # 50000 + 30000 + assert household_incomes[2] == 100000 # 60000 + 40000 + + +def test_map_household_to_person_expands(): + """Test mapping household-level data to person level expands correctly.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 
2, 3], + "household_id": [1, 1, 2], + "tax_unit_id": [1, 1, 2], + "person_weight": [1.0, 1.0, 1.0], + } + ), + weights="person_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [1, 2], + "rent": [1000, 800], + "household_weight": [1.0, 1.0], + } + ), + weights="household_weight", + ) + + tax_unit_df = MicroDataFrame( + pd.DataFrame({"tax_unit_id": [1, 2], "tax_unit_weight": [1.0, 1.0]}), + weights="tax_unit_weight", + ) + + marital_unit_df = MicroDataFrame( + pd.DataFrame( + {"marital_unit_id": [1, 2], "marital_unit_weight": [1.0, 1.0]} + ), + weights="marital_unit_weight", + ) + + family_df = MicroDataFrame( + pd.DataFrame({"family_id": [1, 2], "family_weight": [1.0, 1.0]}), + weights="family_weight", + ) + + spm_unit_df = MicroDataFrame( + pd.DataFrame({"spm_unit_id": [1, 2], "spm_unit_weight": [1.0, 1.0]}), + weights="spm_unit_weight", + ) + + data = USYearData( + person=person_df, + household=household_df, + tax_unit=tax_unit_df, + marital_unit=marital_unit_df, + family=family_df, + spm_unit=spm_unit_df, + ) + + result = data.map_to_entity("household", "person", columns=["rent"]) + + # Should have rows for each person + assert len(result) == 3 + # Should have household data merged in (replicated) + assert "household_id" in result.columns + assert "person_id" in result.columns + assert "rent" in result.columns + + # Rent should be replicated to all persons in household + person_rents = result.set_index("person_id")["rent"].to_dict() + assert person_rents[1] == 1000 # Person 1 in household 1 + assert person_rents[2] == 1000 # Person 2 in household 1 + assert person_rents[3] == 800 # Person 3 in household 2 + + +def test_map_tax_unit_to_household_via_person(): + """Test mapping tax_unit to household goes through person and aggregates.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1, 2, 3, 4], + "household_id": [1, 1, 2, 2], + "tax_unit_id": [1, 1, 2, 3], + "person_weight": [1.0, 1.0, 1.0, 1.0], + } + ), + 
weights="person_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1, 2], "household_weight": [1.0, 1.0]}), + weights="household_weight", + ) + + tax_unit_df = MicroDataFrame( + pd.DataFrame( + { + "tax_unit_id": [1, 2, 3], + "taxable_income": [80000, 60000, 40000], + "tax_unit_weight": [1.0, 1.0, 1.0], + } + ), + weights="tax_unit_weight", + ) + + marital_unit_df = MicroDataFrame( + pd.DataFrame( + {"marital_unit_id": [1, 2], "marital_unit_weight": [1.0, 1.0]} + ), + weights="marital_unit_weight", + ) + + family_df = MicroDataFrame( + pd.DataFrame({"family_id": [1, 2], "family_weight": [1.0, 1.0]}), + weights="family_weight", + ) + + spm_unit_df = MicroDataFrame( + pd.DataFrame({"spm_unit_id": [1, 2], "spm_unit_weight": [1.0, 1.0]}), + weights="spm_unit_weight", + ) + + data = USYearData( + person=person_df, + household=household_df, + tax_unit=tax_unit_df, + marital_unit=marital_unit_df, + family=family_df, + spm_unit=spm_unit_df, + ) + + result = data.map_to_entity( + "tax_unit", "household", columns=["taxable_income"] + ) + + # Should return household-level data + assert len(result) == 2 + assert "taxable_income" in result.columns + + # Income should be aggregated at household level + # Household 1 has tax_unit 1 (80000) + # Household 2 has tax_unit 2 (60000) and tax_unit 3 (40000) = 100000 + household_incomes = result.set_index("household_id")[ + "taxable_income" + ].to_dict() + assert household_incomes[1] == 80000 + assert household_incomes[2] == 100000 + + +def test_invalid_entity_names(): + """Test that invalid entity names raise ValueError.""" + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [1], + "household_id": [1], + "tax_unit_id": [1], + "person_weight": [1.0], + } + ), + weights="person_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame({"household_id": [1], "household_weight": [1.0]}), + weights="household_weight", + ) + + tax_unit_df = MicroDataFrame( + pd.DataFrame({"tax_unit_id": [1], 
"tax_unit_weight": [1.0]}), + weights="tax_unit_weight", + ) + + marital_unit_df = MicroDataFrame( + pd.DataFrame({"marital_unit_id": [1], "marital_unit_weight": [1.0]}), + weights="marital_unit_weight", + ) + + family_df = MicroDataFrame( + pd.DataFrame({"family_id": [1], "family_weight": [1.0]}), + weights="family_weight", + ) + + spm_unit_df = MicroDataFrame( + pd.DataFrame({"spm_unit_id": [1], "spm_unit_weight": [1.0]}), + weights="spm_unit_weight", + ) + + data = USYearData( + person=person_df, + household=household_df, + tax_unit=tax_unit_df, + marital_unit=marital_unit_df, + family=family_df, + spm_unit=spm_unit_df, + ) + + with pytest.raises(ValueError, match="Invalid source entity"): + data.map_to_entity("invalid", "person") + + with pytest.raises(ValueError, match="Invalid target entity"): + data.map_to_entity("person", "invalid") diff --git a/tests/test_us_simulation.py b/tests/test_us_simulation.py new file mode 100644 index 00000000..4de79691 --- /dev/null +++ b/tests/test_us_simulation.py @@ -0,0 +1,262 @@ +import os +import tempfile + +import pandas as pd +from microdf import MicroDataFrame + +from policyengine.core import Simulation +from policyengine.tax_benefit_models.us import ( + PolicyEngineUSDataset, + USYearData, + us_latest, +) + + +def test_us_latest_instantiation(): + """Test that us_latest can be instantiated without errors.""" + assert us_latest is not None + assert us_latest.version is not None + assert us_latest.model is not None + assert us_latest.created_at is not None + assert ( + len(us_latest.variables) > 0 + ) # Should have variables from policyengine-us + + +def test_save_and_load_us_dataset(): + """Test saving and loading a US dataset.""" + # Create sample data with minimal required columns + person_df = MicroDataFrame( + pd.DataFrame( + { + "person_id": [0, 1, 2], + "household_id": [0, 0, 1], + "marital_unit_id": [0, 0, 1], + "family_id": [0, 0, 1], + "spm_unit_id": [0, 0, 1], + "tax_unit_id": [0, 0, 1], + "age": [30, 35, 25], 
+ "employment_income": [50000, 60000, 40000], + "person_weight": [1000.0, 1000.0, 1000.0], + } + ), + weights="person_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [0, 1], + "household_weight": [1000.0, 1000.0], + } + ), + weights="household_weight", + ) + + marital_unit_df = MicroDataFrame( + pd.DataFrame( + { + "marital_unit_id": [0, 1], + "marital_unit_weight": [1000.0, 1000.0], + } + ), + weights="marital_unit_weight", + ) + + family_df = MicroDataFrame( + pd.DataFrame( + { + "family_id": [0, 1], + "family_weight": [1000.0, 1000.0], + } + ), + weights="family_weight", + ) + + spm_unit_df = MicroDataFrame( + pd.DataFrame( + { + "spm_unit_id": [0, 1], + "spm_unit_weight": [1000.0, 1000.0], + } + ), + weights="spm_unit_weight", + ) + + tax_unit_df = MicroDataFrame( + pd.DataFrame( + { + "tax_unit_id": [0, 1], + "tax_unit_weight": [1000.0, 1000.0], + } + ), + weights="tax_unit_weight", + ) + + # Create dataset + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test_us_dataset.h5") + + dataset = PolicyEngineUSDataset( + name="Test US Dataset", + description="A test US dataset", + filepath=filepath, + year=2024, + data=USYearData( + person=person_df, + household=household_df, + marital_unit=marital_unit_df, + family=family_df, + spm_unit=spm_unit_df, + tax_unit=tax_unit_df, + ), + ) + + # Save to file + dataset.save() + + # Load it back + loaded = PolicyEngineUSDataset( + name="Loaded US Dataset", + description="Loaded from file", + filepath=filepath, + year=2024, + ) + loaded.load() + + # Verify data + assert loaded.year == 2024 + pd.testing.assert_frame_equal( + pd.DataFrame(loaded.data.person), pd.DataFrame(person_df) + ) + pd.testing.assert_frame_equal( + pd.DataFrame(loaded.data.household), pd.DataFrame(household_df) + ) + + +def test_us_simulation_from_dataset(): + """Test running a US simulation from a dataset using PolicyEngine Core pattern.""" + # Create a small test dataset + person_df 
= MicroDataFrame( + pd.DataFrame( + { + "person_id": [0, 1], + "household_id": [0, 0], + "marital_unit_id": [0, 0], + "family_id": [0, 0], + "spm_unit_id": [0, 0], + "tax_unit_id": [0, 0], + "age": [30, 35], + "employment_income": [50000, 60000], + "person_weight": [1000.0, 1000.0], + } + ), + weights="person_weight", + ) + + household_df = MicroDataFrame( + pd.DataFrame( + { + "household_id": [0], + "state_name": ["CA"], + "household_weight": [1000.0], + } + ), + weights="household_weight", + ) + + marital_unit_df = MicroDataFrame( + pd.DataFrame( + { + "marital_unit_id": [0], + "marital_unit_weight": [1000.0], + } + ), + weights="marital_unit_weight", + ) + + family_df = MicroDataFrame( + pd.DataFrame( + { + "family_id": [0], + "family_weight": [1000.0], + } + ), + weights="family_weight", + ) + + spm_unit_df = MicroDataFrame( + pd.DataFrame( + { + "spm_unit_id": [0], + "spm_unit_weight": [1000.0], + } + ), + weights="spm_unit_weight", + ) + + tax_unit_df = MicroDataFrame( + pd.DataFrame( + { + "tax_unit_id": [0], + "tax_unit_weight": [1000.0], + } + ), + weights="tax_unit_weight", + ) + + with tempfile.TemporaryDirectory() as tmpdir: + filepath = os.path.join(tmpdir, "test_simulation.h5") + + dataset = PolicyEngineUSDataset( + name="Test Simulation Dataset", + description="Dataset for testing simulation", + filepath=filepath, + year=2024, + data=USYearData( + person=person_df, + household=household_df, + marital_unit=marital_unit_df, + family=family_df, + spm_unit=spm_unit_df, + tax_unit=tax_unit_df, + ), + ) + + # Create and run simulation + simulation = Simulation( + dataset=dataset, + tax_benefit_model_version=us_latest, + variables={ + "person": [ + "person_id", + "person_weight", + "age", + "employment_income", + ], + "household": ["household_id", "household_weight"], + "marital_unit": ["marital_unit_id", "marital_unit_weight"], + "family": ["family_id", "family_weight"], + "spm_unit": ["spm_unit_id", "spm_unit_weight"], + "tax_unit": ["tax_unit_id", 
"tax_unit_weight"], + }, + ) + + simulation.run() + + # Verify output dataset was created + assert simulation.output_dataset is not None + assert simulation.output_dataset.data is not None + + # Verify person data contains the expected variables + person_output = pd.DataFrame(simulation.output_dataset.data.person) + assert "person_id" in person_output.columns + assert "age" in person_output.columns + assert "employment_income" in person_output.columns + assert len(person_output) == 2 # Should have 2 people + + # Verify employment income values match input + assert person_output["employment_income"].tolist() == [ + 50000, + 60000, + ] diff --git a/uv.lock b/uv.lock index c7bf4216..811c58e8 100644 --- a/uv.lock +++ b/uv.lock @@ -23,20 +23,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/34/d4e1c02d3bee589efb5dfa17f88ea08bdb3e3eac12bc475462aec52ed223/alabaster-0.7.16-py3-none-any.whl", hash = "sha256:b46733c07dce03ae4e150330b975c75737fa60f0a7c591b6c8bf4928a28e2c92", size = 13511, upload-time = "2024-01-10T00:56:08.388Z" }, ] -[[package]] -name = "alembic" -version = "1.16.5" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "mako" }, - { name = "sqlalchemy" }, - { name = "typing-extensions" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9a/ca/4dc52902cf3491892d464f5265a81e9dff094692c8a049a3ed6a05fe7ee8/alembic-1.16.5.tar.gz", hash = "sha256:a88bb7f6e513bd4301ecf4c7f2206fe93f9913f9b48dac3b78babde2d6fe765e", size = 1969868, upload-time = "2025-08-27T18:02:05.668Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/39/4a/4c61d4c84cfd9befb6fa08a702535b27b21fff08c946bc2f6139decbf7f7/alembic-1.16.5-py3-none-any.whl", hash = "sha256:e845dfe090c5ffa7b92593ae6687c5cb1a101e91fa53868497dbd79847f9dbe3", size = 247355, upload-time = "2025-08-27T18:02:07.37Z" }, -] - [[package]] name = "annotated-types" version = "0.7.0" @@ -137,22 +123,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/09/71/54e999902aed72baf26bca0d50781b01838251a462612966e9fc4891eadd/black-25.1.0-py3-none-any.whl", hash = "sha256:95e8176dae143ba9097f351d174fdaf0ccd29efb414b362ae3fd72bf0f710717", size = 207646, upload-time = "2025-01-29T04:15:38.082Z" }, ] -[[package]] -name = "blosc" -version = "1.11.3" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/e3/ca/3ec5a5d05e10ad200d887c8cfb9492d9a02e05f9f8f726aa178123b1711b/blosc-1.11.3.tar.gz", hash = "sha256:89ed658eba7814a92e89c44d8c524148d55921595bc133bd1a90f8888a9e088e", size = 1439627, upload-time = "2025-05-17T11:50:03.713Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/a5/23/6ee0e7270ad6299e73483dfad31b17f8acf66f7768094316a35ee0534f1d/blosc-1.11.3-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:9b474c70b9765587323dd1d7ff8e9fa9e9b35ccb3bee77e7658ce9faf2e05f7f", size = 2291576, upload-time = "2025-05-17T11:49:41.013Z" }, - { url = "https://files.pythonhosted.org/packages/51/8f/d8097dd6bf952d4bc1a31852f717d5a1157b32c1bea50dac723ed8e6bc8d/blosc-1.11.3-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:291d153864f53960861a48c2a5f6706adc2a84a2bdd9c3d1c5353d9c32748a03", size = 1801973, upload-time = "2025-05-17T11:49:42.259Z" }, - { url = "https://files.pythonhosted.org/packages/1e/cb/7fdf0756e6a38d6a28c5063bc8ba8a8c8b1a1ab6980d777c52ca7dd942b1/blosc-1.11.3-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ece67bb34741a147e4120cff3ee3784121709a112d16795716b8f4239aaddfa4", size = 2485043, upload-time = "2025-05-17T11:49:44.034Z" }, - { url = "https://files.pythonhosted.org/packages/7c/b8/d21a1305356312ca0fc6bd54ad6fb91e7434f0efef545972eb72f040c815/blosc-1.11.3-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e70216dbddb85b69a8d0f62a4a5c09b7a1fce9ca2f329793e799f8b6f9fa3ab0", size = 2619988, upload-time = "2025-05-17T11:49:45.346Z" }, - { url = 
"https://files.pythonhosted.org/packages/a0/79/9ed273c9493e02f0bc5deacd3854ecabd6c6ba5371ed04b6c7702fd16f77/blosc-1.11.3-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:220865ffcac638f8f0f4b51259d4e4f3236165e5b43fffd1e836cd7cd29b9367", size = 2678176, upload-time = "2025-05-17T11:49:47.12Z" }, - { url = "https://files.pythonhosted.org/packages/79/0e/c50458a1e038c0f0da70c3223d2a34ad702b86a79d0921f23a8ffaae035f/blosc-1.11.3-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:d57dde8c335378e8443757b69d0b29e90dfc53047d01311e952aecc815167dec", size = 2752740, upload-time = "2025-05-17T11:49:48.909Z" }, - { url = "https://files.pythonhosted.org/packages/f1/0e/3a5ed949e0e23eb576c08017bb39e8612607cf8f591d8149b0fb82469a03/blosc-1.11.3-cp313-cp313-win32.whl", hash = "sha256:d3d72046580a50177811916c78130d6ae7307420733de6e950cb567c896b1ca5", size = 1530991, upload-time = "2025-05-17T11:49:50.121Z" }, - { url = "https://files.pythonhosted.org/packages/06/d4/0c3cdaf34b3ef705fdab465ad8df4a3bce5bbdf2bca8f2515eae90ae28a0/blosc-1.11.3-cp313-cp313-win_amd64.whl", hash = "sha256:73721c1949f2b8d2f4168cababbfe6280511f0da9a971ba7ec9c56eab9603824", size = 1815688, upload-time = "2025-05-17T11:49:51.434Z" }, -] - [[package]] name = "blosc2" version = "3.7.2" @@ -199,24 +169,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/cb/8c/2b30c12155ad8de0cf641d76a8b396a16d2c36bc6d50b621a62b7c4567c1/build-1.3.0-py3-none-any.whl", hash = "sha256:7145f0b5061ba90a1500d60bd1b13ca0a8a4cebdd0cc16ed8adf1c0e739f43b4", size = 23382, upload-time = "2025-08-01T21:27:07.844Z" }, ] -[[package]] -name = "cachetools" -version = "5.5.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/6c/81/3747dad6b14fa2cf53fcf10548cf5aea6913e96fab41a3c198676f8948a5/cachetools-5.5.2.tar.gz", hash = "sha256:1a661caa9175d26759571b2e19580f9d6393969e5dfca11fdb1f947a23e640d4", size = 28380, upload-time = "2025-02-20T21:01:19.524Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/72/76/20fa66124dbe6be5cafeb312ece67de6b61dd91a0247d1ea13db4ebb33c2/cachetools-5.5.2-py3-none-any.whl", hash = "sha256:d26a22bcc62eb95c3beabd9f1ee5e820d3d2704fe2967cbe350e20c8ffcd3f0a", size = 10080, upload-time = "2025-02-20T21:01:16.647Z" }, -] - -[[package]] -name = "caugetch" -version = "0.0.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/a3/ec/519cb37e3e58e23a5b02a74049128f6e701ccd8892b0cebecf701fac6177/caugetch-0.0.1.tar.gz", hash = "sha256:6f6ddb3b928fa272071b02aabb3342941cd99992f27413ba8c189eb4dc3e33b0", size = 2071, upload-time = "2019-10-15T22:39:49.315Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/70/33/64fee4626ec943c2d0c4eee31c784dab8452dfe014916190730880d4ea62/caugetch-0.0.1-py3-none-any.whl", hash = "sha256:ee743dcbb513409cd24cfc42435418073683ba2f4bb7ee9f8440088a47d59277", size = 3439, upload-time = "2019-10-15T22:39:47.122Z" }, -] - [[package]] name = "certifi" version = "2025.8.3" @@ -291,15 +243,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/85/32/10bb5764d90a8eee674e9dc6f4db6a0ab47c8c4d0d83c27f7c39ac415a4d/click-8.2.1-py3-none-any.whl", hash = "sha256:61a3265b914e850b85317d0b3109c7f8cd35a670f963866005d6ef1d5175a12b", size = 102215, upload-time = "2025-05-20T23:19:47.796Z" }, ] -[[package]] -name = "clipboard" -version = "0.0.4" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyperclip" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/8a/38/17f3885713d0f39994563029942b1d31c93d4e56d80da505abfbfb3a3bc4/clipboard-0.0.4.tar.gz", hash = "sha256:a72a78e9c9bf68da1c3f29ee022417d13ec9e3824b511559fd2b702b1dd5b817", size = 1713, upload-time = "2014-05-22T12:49:08.683Z" } - [[package]] name = "colorama" version = "0.4.6" @@ -423,120 +366,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/3a/34/2b07b72bee02a63241d654f5d8af87a2de977c59638eec41ca356ab915cd/furo-2025.7.19-py3-none-any.whl", hash = "sha256:bdea869822dfd2b494ea84c0973937e35d1575af088b6721a29c7f7878adc9e3", size = 342175, upload-time = "2025-07-19T10:52:02.399Z" }, ] -[[package]] -name = "getpass4" -version = "0.0.14.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "caugetch" }, - { name = "clipboard" }, - { name = "colorama" }, - { name = "pyperclip" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/a2/f9/312f84afc384f693d02eb4ff7306a7268577a8b808aa08f0124c9abba683/getpass4-0.0.14.1.tar.gz", hash = "sha256:80aa4e3a665f2eccc6cda3ee22125eeb5c6338e91c40c4fd010b3c94c7aa4d3a", size = 5078, upload-time = "2021-11-28T17:08:47.276Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/0f/d3/ea114aba31f76418b2162e811793cde2e822c9d9ea8ca98d67f9e1f1bde6/getpass4-0.0.14.1-py3-none-any.whl", hash = "sha256:6642c11fb99db1bec90b963e863ec71cdb0b8888000f5089c6377bfbf833f8a9", size = 8683, upload-time = "2021-11-28T17:08:45.468Z" }, -] - -[[package]] -name = "google-api-core" -version = "2.25.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-auth" }, - { name = "googleapis-common-protos" }, - { name = "proto-plus" }, - { name = "protobuf" }, - { name = "requests" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/dc/21/e9d043e88222317afdbdb567165fdbc3b0aad90064c7e0c9eb0ad9955ad8/google_api_core-2.25.1.tar.gz", hash = "sha256:d2aaa0b13c78c61cb3f4282c464c046e45fbd75755683c9c525e6e8f7ed0a5e8", size = 165443, upload-time = "2025-06-12T20:52:20.439Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/14/4b/ead00905132820b623732b175d66354e9d3e69fcf2a5dcdab780664e7896/google_api_core-2.25.1-py3-none-any.whl", hash = "sha256:8a2a56c1fef82987a524371f99f3bd0143702fecc670c72e600c1cda6bf8dbb7", size = 160807, upload-time = "2025-06-12T20:52:19.334Z" 
}, -] - -[[package]] -name = "google-auth" -version = "2.40.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "cachetools" }, - { name = "pyasn1-modules" }, - { name = "rsa" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/9b/e92ef23b84fa10a64ce4831390b7a4c2e53c0132568d99d4ae61d04c8855/google_auth-2.40.3.tar.gz", hash = "sha256:500c3a29adedeb36ea9cf24b8d10858e152f2412e3ca37829b3fa18e33d63b77", size = 281029, upload-time = "2025-06-04T18:04:57.577Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/17/63/b19553b658a1692443c62bd07e5868adaa0ad746a0751ba62c59568cd45b/google_auth-2.40.3-py2.py3-none-any.whl", hash = "sha256:1370d4593e86213563547f97a92752fc658456fe4514c809544f330fed45a7ca", size = 216137, upload-time = "2025-06-04T18:04:55.573Z" }, -] - -[[package]] -name = "google-cloud-core" -version = "2.4.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core" }, - { name = "google-auth" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d6/b8/2b53838d2acd6ec6168fd284a990c76695e84c65deee79c9f3a4276f6b4f/google_cloud_core-2.4.3.tar.gz", hash = "sha256:1fab62d7102844b278fe6dead3af32408b1df3eb06f5c7e8634cbd40edc4da53", size = 35861, upload-time = "2025-03-10T21:05:38.948Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/40/86/bda7241a8da2d28a754aad2ba0f6776e35b67e37c36ae0c45d49370f1014/google_cloud_core-2.4.3-py2.py3-none-any.whl", hash = "sha256:5130f9f4c14b4fafdff75c79448f9495cfade0d8775facf1b09c3bf67e027f6e", size = 29348, upload-time = "2025-03-10T21:05:37.785Z" }, -] - -[[package]] -name = "google-cloud-storage" -version = "3.3.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-api-core" }, - { name = "google-auth" }, - { name = "google-cloud-core" }, - { name = "google-crc32c" }, - { name = "google-resumable-media" }, - { name = "requests" }, -] -sdist = { url = 
"https://files.pythonhosted.org/packages/ce/0d/6be1c7e10d1e186e22990fdc22e7ece79f7c622370793cfe88aa8c658316/google_cloud_storage-3.3.1.tar.gz", hash = "sha256:60f291b0881e5c72919b156d1ee276d1b69a2538fcdc35f4e87559ae11678f77", size = 17224623, upload-time = "2025-09-01T05:59:02.804Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/80/67/68eee082fc77e718fa483893ac2463fe0ae8f28ccab334cea9dc5aba99b0/google_cloud_storage-3.3.1-py3-none-any.whl", hash = "sha256:8cace9359b85f315f21868cf771143d6dbb47dcc5a3a9317c8207accc4d10fd3", size = 275070, upload-time = "2025-09-01T05:59:00.633Z" }, -] - -[[package]] -name = "google-crc32c" -version = "1.7.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/19/ae/87802e6d9f9d69adfaedfcfd599266bf386a54d0be058b532d04c794f76d/google_crc32c-1.7.1.tar.gz", hash = "sha256:2bff2305f98846f3e825dbeec9ee406f89da7962accdb29356e4eadc251bd472", size = 14495, upload-time = "2025-03-26T14:29:13.32Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/8b/72/b8d785e9184ba6297a8620c8a37cf6e39b81a8ca01bb0796d7cbb28b3386/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:df8b38bdaf1629d62d51be8bdd04888f37c451564c2042d36e5812da9eff3c35", size = 30467, upload-time = "2025-03-26T14:36:06.909Z" }, - { url = "https://files.pythonhosted.org/packages/34/25/5f18076968212067c4e8ea95bf3b69669f9fc698476e5f5eb97d5b37999f/google_crc32c-1.7.1-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:e42e20a83a29aa2709a0cf271c7f8aefaa23b7ab52e53b322585297bb94d4638", size = 30309, upload-time = "2025-03-26T15:06:15.318Z" }, - { url = "https://files.pythonhosted.org/packages/92/83/9228fe65bf70e93e419f38bdf6c5ca5083fc6d32886ee79b450ceefd1dbd/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:905a385140bf492ac300026717af339790921f411c0dfd9aa5a9e69a08ed32eb", size = 33133, upload-time = "2025-03-26T14:41:34.388Z" }, - { url = 
"https://files.pythonhosted.org/packages/c3/ca/1ea2fd13ff9f8955b85e7956872fdb7050c4ace8a2306a6d177edb9cf7fe/google_crc32c-1.7.1-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6b211ddaf20f7ebeec5c333448582c224a7c90a9d98826fbab82c0ddc11348e6", size = 32773, upload-time = "2025-03-26T14:41:35.19Z" }, - { url = "https://files.pythonhosted.org/packages/89/32/a22a281806e3ef21b72db16f948cad22ec68e4bdd384139291e00ff82fe2/google_crc32c-1.7.1-cp313-cp313-win_amd64.whl", hash = "sha256:0f99eaa09a9a7e642a61e06742856eec8b19fc0037832e03f941fe7cf0c8e4db", size = 33475, upload-time = "2025-03-26T14:29:11.771Z" }, - { url = "https://files.pythonhosted.org/packages/b8/c5/002975aff514e57fc084ba155697a049b3f9b52225ec3bc0f542871dd524/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:32d1da0d74ec5634a05f53ef7df18fc646666a25efaaca9fc7dcfd4caf1d98c3", size = 33243, upload-time = "2025-03-26T14:41:35.975Z" }, - { url = "https://files.pythonhosted.org/packages/61/cb/c585282a03a0cea70fcaa1bf55d5d702d0f2351094d663ec3be1c6c67c52/google_crc32c-1.7.1-cp313-cp313t-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e10554d4abc5238823112c2ad7e4560f96c7bf3820b202660373d769d9e6e4c9", size = 32870, upload-time = "2025-03-26T14:41:37.08Z" }, -] - -[[package]] -name = "google-resumable-media" -version = "2.7.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "google-crc32c" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/58/5a/0efdc02665dca14e0837b62c8a1a93132c264bd02054a15abb2218afe0ae/google_resumable_media-2.7.2.tar.gz", hash = "sha256:5280aed4629f2b60b847b0d42f9857fd4935c11af266744df33d8074cae92fe0", size = 2163099, upload-time = "2024-08-07T22:20:38.555Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/82/35/b8d3baf8c46695858cb9d8835a53baa1eeb9906ddaf2f728a5f5b640fd1e/google_resumable_media-2.7.2-py2.py3-none-any.whl", hash = 
"sha256:3ce7551e9fe6d99e9a126101d2536612bb73486721951e9562fee0f90c6ababa", size = 81251, upload-time = "2024-08-07T22:20:36.409Z" }, -] - -[[package]] -name = "googleapis-common-protos" -version = "1.70.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/39/24/33db22342cf4a2ea27c9955e6713140fedd51e8b141b5ce5260897020f1a/googleapis_common_protos-1.70.0.tar.gz", hash = "sha256:0e1b44e0ea153e6594f9f394fef15193a68aaaea2d843f83e2742717ca753257", size = 145903, upload-time = "2025-04-14T10:17:02.924Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/86/f1/62a193f0227cf15a920390abe675f386dec35f7ae3ffe6da582d3ade42c7/googleapis_common_protos-1.70.0-py3-none-any.whl", hash = "sha256:b8bfcca8c25a2bb253e0e0b0adaf8c00773e5e6af6fd92397576680b807e0fd8", size = 294530, upload-time = "2025-04-14T10:17:01.271Z" }, -] - [[package]] name = "greenlet" version = "3.2.4" @@ -551,6 +380,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/ee/43/3cecdc0349359e1a527cbf2e3e28e5f8f06d3343aaf82ca13437a9aa290f/greenlet-3.2.4-cp313-cp313-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:23768528f2911bcd7e475210822ffb5254ed10d71f4028387e5a99b4c6699671", size = 610497, upload-time = "2025-08-07T13:18:31.636Z" }, { url = "https://files.pythonhosted.org/packages/b8/19/06b6cf5d604e2c382a6f31cafafd6f33d5dea706f4db7bdab184bad2b21d/greenlet-3.2.4-cp313-cp313-musllinux_1_1_aarch64.whl", hash = "sha256:00fadb3fedccc447f517ee0d3fd8fe49eae949e1cd0f6a611818f4f6fb7dc83b", size = 1121662, upload-time = "2025-08-07T13:42:41.117Z" }, { url = "https://files.pythonhosted.org/packages/a2/15/0d5e4e1a66fab130d98168fe984c509249c833c1a3c16806b90f253ce7b9/greenlet-3.2.4-cp313-cp313-musllinux_1_1_x86_64.whl", hash = "sha256:d25c5091190f2dc0eaa3f950252122edbbadbb682aa7b1ef2f8af0f8c0afefae", size = 1149210, upload-time = "2025-08-07T13:18:24.072Z" }, + { url = 
"https://files.pythonhosted.org/packages/1c/53/f9c440463b3057485b8594d7a638bed53ba531165ef0ca0e6c364b5cc807/greenlet-3.2.4-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:6e343822feb58ac4d0a1211bd9399de2b3a04963ddeec21530fc426cc121f19b", size = 1564759, upload-time = "2025-11-04T12:42:19.395Z" }, + { url = "https://files.pythonhosted.org/packages/47/e4/3bb4240abdd0a8d23f4f88adec746a3099f0d86bfedb623f063b2e3b4df0/greenlet-3.2.4-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:ca7f6f1f2649b89ce02f6f229d7c19f680a6238af656f61e0115b24857917929", size = 1634288, upload-time = "2025-11-04T12:42:21.174Z" }, { url = "https://files.pythonhosted.org/packages/0b/55/2321e43595e6801e105fcfdee02b34c0f996eb71e6ddffca6b10b7e1d771/greenlet-3.2.4-cp313-cp313-win_amd64.whl", hash = "sha256:554b03b6e73aaabec3745364d6239e9e012d64c68ccd0b8430c64ccc14939a8b", size = 299685, upload-time = "2025-08-07T13:24:38.824Z" }, { url = "https://files.pythonhosted.org/packages/22/5c/85273fd7cc388285632b0498dbbab97596e04b154933dfe0f3e68156c68c/greenlet-3.2.4-cp314-cp314-macosx_11_0_universal2.whl", hash = "sha256:49a30d5fda2507ae77be16479bdb62a660fa51b1eb4928b524975b3bde77b3c0", size = 273586, upload-time = "2025-08-07T13:16:08.004Z" }, { url = "https://files.pythonhosted.org/packages/d1/75/10aeeaa3da9332c2e761e4c50d4c3556c21113ee3f0afa2cf5769946f7a3/greenlet-3.2.4-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:299fd615cd8fc86267b47597123e3f43ad79c9d8a22bebdce535e53550763e2f", size = 686346, upload-time = "2025-08-07T13:42:59.944Z" }, @@ -558,6 +389,8 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/dc/8b/29aae55436521f1d6f8ff4e12fb676f3400de7fcf27fccd1d4d17fd8fecd/greenlet-3.2.4-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.whl", hash = "sha256:b4a1870c51720687af7fa3e7cda6d08d801dae660f75a76f3845b642b4da6ee1", size = 694659, upload-time = "2025-08-07T13:53:17.759Z" }, { url = 
"https://files.pythonhosted.org/packages/92/2e/ea25914b1ebfde93b6fc4ff46d6864564fba59024e928bdc7de475affc25/greenlet-3.2.4-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.whl", hash = "sha256:061dc4cf2c34852b052a8620d40f36324554bc192be474b9e9770e8c042fd735", size = 695355, upload-time = "2025-08-07T13:18:34.517Z" }, { url = "https://files.pythonhosted.org/packages/72/60/fc56c62046ec17f6b0d3060564562c64c862948c9d4bc8aa807cf5bd74f4/greenlet-3.2.4-cp314-cp314-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:44358b9bf66c8576a9f57a590d5f5d6e72fa4228b763d0e43fee6d3b06d3a337", size = 657512, upload-time = "2025-08-07T13:18:33.969Z" }, + { url = "https://files.pythonhosted.org/packages/23/6e/74407aed965a4ab6ddd93a7ded3180b730d281c77b765788419484cdfeef/greenlet-3.2.4-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:2917bdf657f5859fbf3386b12d68ede4cf1f04c90c3a6bc1f013dd68a22e2269", size = 1612508, upload-time = "2025-11-04T12:42:23.427Z" }, + { url = "https://files.pythonhosted.org/packages/0d/da/343cd760ab2f92bac1845ca07ee3faea9fe52bee65f7bcb19f16ad7de08b/greenlet-3.2.4-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:015d48959d4add5d6c9f6c5210ee3803a830dce46356e3bc326d6776bde54681", size = 1680760, upload-time = "2025-11-04T12:42:25.341Z" }, { url = "https://files.pythonhosted.org/packages/e3/a5/6ddab2b4c112be95601c13428db1d8b6608a8b6039816f2ba09c346c08fc/greenlet-3.2.4-cp314-cp314-win_amd64.whl", hash = "sha256:e37ab26028f12dbb0ff65f29a8d3d44a765c61e729647bf2ddfbbed621726f01", size = 303425, upload-time = "2025-08-07T13:32:27.59Z" }, ] @@ -694,22 +527,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/91/d0/274fbf7b0b12643cbbc001ce13e6a5b1607ac4929d1b11c72460152c9fc3/ipython-8.37.0-py3-none-any.whl", hash = "sha256:ed87326596b878932dbcb171e3e698845434d8c61b8d8cd474bf663041a9dcf2", size = 831864, upload-time = "2025-05-31T16:39:06.38Z" }, ] -[[package]] -name = "ipywidgets" -version = "8.1.7" -source = { registry = 
"https://pypi.org/simple" } -dependencies = [ - { name = "comm" }, - { name = "ipython" }, - { name = "jupyterlab-widgets" }, - { name = "traitlets" }, - { name = "widgetsnbextension" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/3e/48/d3dbac45c2814cb73812f98dd6b38bbcc957a4e7bb31d6ea9c03bf94ed87/ipywidgets-8.1.7.tar.gz", hash = "sha256:15f1ac050b9ccbefd45dccfbb2ef6bed0029d8278682d569d71b8dd96bee0376", size = 116721, upload-time = "2025-05-05T12:42:03.489Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/58/6a/9166369a2f092bd286d24e6307de555d63616e8ddb373ebad2b5635ca4cd/ipywidgets-8.1.7-py3-none-any.whl", hash = "sha256:764f2602d25471c213919b8a1997df04bef869251db4ca8efba1b76b1bd9f7bb", size = 139806, upload-time = "2025-05-05T12:41:56.833Z" }, -] - [[package]] name = "itables" version = "2.5.2" @@ -862,15 +679,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/2f/57/6bffd4b20b88da3800c5d691e0337761576ee688eb01299eae865689d2df/jupyter_core-5.8.1-py3-none-any.whl", hash = "sha256:c28d268fc90fb53f1338ded2eb410704c5449a358406e8a948b75706e24863d0", size = 28880, upload-time = "2025-05-27T07:38:15.137Z" }, ] -[[package]] -name = "jupyterlab-widgets" -version = "3.0.15" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/b9/7d/160595ca88ee87ac6ba95d82177d29ec60aaa63821d3077babb22ce031a5/jupyterlab_widgets-3.0.15.tar.gz", hash = "sha256:2920888a0c2922351a9202817957a68c07d99673504d6cd37345299e971bb08b", size = 213149, upload-time = "2025-05-05T12:32:31.004Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/43/6a/ca128561b22b60bd5a0c4ea26649e68c8556b82bc70a0c396eebc977fe86/jupyterlab_widgets-3.0.15-py3-none-any.whl", hash = "sha256:d59023d7d7ef71400d51e6fee9a88867f6e65e10a4201605d2d7f3e8f012a31c", size = 216571, upload-time = "2025-05-05T12:32:29.534Z" }, -] - [[package]] name = "latexcodec" version = "3.0.1" @@ -892,18 +700,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/04/1e/b832de447dee8b582cac175871d2f6c3d5077cc56d5575cadba1fd1cccfa/linkify_it_py-2.0.3-py3-none-any.whl", hash = "sha256:6bcbc417b0ac14323382aef5c5192c0075bf8a9d6b41820a2b66371eac6b6d79", size = 19820, upload-time = "2024-02-04T14:48:02.496Z" }, ] -[[package]] -name = "mako" -version = "1.3.10" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markupsafe" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/9e/38/bd5b78a920a64d708fe6bc8e0a2c075e1389d53bef8413725c63ba041535/mako-1.3.10.tar.gz", hash = "sha256:99579a6f39583fa7e5630a28c3c1f440e4e97a414b80372649c0ce338da2ea28", size = 392474, upload-time = "2025-04-10T12:44:31.16Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/87/fb/99f81ac72ae23375f22b7afdb7642aba97c00a713c217124420147681a2f/mako-1.3.10-py3-none-any.whl", hash = "sha256:baef24a52fc4fc514a0887ac600f9f1cff3d82c61d4d700a1fa84d597b88db59", size = 78509, upload-time = "2025-04-10T12:50:53.297Z" }, -] - [[package]] name = "markdown-it-py" version = "3.0.0" @@ -1284,23 +1080,14 @@ wheels = [ [[package]] name = "policyengine" -version = "1.0.0" +version = "3.0.0" source = { editable = "." 
} dependencies = [ - { name = "alembic" }, - { name = "blosc" }, - { name = "getpass4" }, - { name = "google-cloud-storage" }, - { name = "ipywidgets" }, { name = "microdf-python" }, { name = "pandas" }, - { name = "psycopg2-binary" }, + { name = "plotly" }, { name = "pydantic" }, - { name = "pymysql" }, - { name = "rich" }, - { name = "sqlalchemy" }, - { name = "sqlmodel" }, - { name = "tqdm" }, + { name = "requests" }, ] [package.optional-dependencies] @@ -1311,6 +1098,9 @@ dev = [ { name = "furo" }, { name = "itables" }, { name = "jupyter-book" }, + { name = "policyengine-core" }, + { name = "policyengine-uk" }, + { name = "policyengine-us" }, { name = "pytest" }, { name = "pytest-asyncio" }, { name = "ruff" }, @@ -1327,33 +1117,27 @@ us = [ [package.metadata] requires-dist = [ - { name = "alembic", specifier = ">=1.13.0" }, { name = "autodoc-pydantic", marker = "extra == 'dev'" }, { name = "black", marker = "extra == 'dev'" }, - { name = "blosc", specifier = ">=1.11.3" }, { name = "build", marker = "extra == 'dev'" }, { name = "furo", marker = "extra == 'dev'" }, - { name = "getpass4" }, - { name = "google-cloud-storage", specifier = ">=2.10.0" }, - { name = "ipywidgets", specifier = ">=8.0.0" }, { name = "itables", marker = "extra == 'dev'" }, { name = "jupyter-book", marker = "extra == 'dev'" }, { name = "microdf-python" }, { name = "pandas", specifier = ">=2.0.0" }, + { name = "plotly", specifier = ">=5.0.0" }, + { name = "policyengine-core", marker = "extra == 'dev'", specifier = ">=3.10" }, { name = "policyengine-core", marker = "extra == 'uk'", specifier = ">=3.10" }, { name = "policyengine-core", marker = "extra == 'us'", specifier = ">=3.10" }, - { name = "policyengine-uk", marker = "extra == 'uk'" }, + { name = "policyengine-uk", marker = "extra == 'dev'", specifier = ">=2.51.0" }, + { name = "policyengine-uk", marker = "extra == 'uk'", specifier = ">=2.51.0" }, + { name = "policyengine-us", marker = "extra == 'dev'", specifier = ">=1.213.1" }, { name 
= "policyengine-us", marker = "extra == 'us'", specifier = ">=1.213.1" }, - { name = "psycopg2-binary", specifier = ">=2.9.0" }, { name = "pydantic", specifier = ">=2.0.0" }, - { name = "pymysql", specifier = ">=1.1.0" }, { name = "pytest", marker = "extra == 'dev'" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.26.0" }, - { name = "rich", specifier = ">=13.0.0" }, + { name = "requests", specifier = ">=2.31.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.5.0" }, - { name = "sqlalchemy", specifier = ">=2.0.0" }, - { name = "sqlmodel", specifier = ">=0.0.21" }, - { name = "tqdm", specifier = ">=4.67.1" }, { name = "yaml-changelog", marker = "extra == 'dev'", specifier = ">=0.1.7" }, ] provides-extras = ["uk", "us", "dev"] @@ -1387,7 +1171,7 @@ wheels = [ [[package]] name = "policyengine-uk" -version = "2.50.0" +version = "2.55.3" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "microdf-python" }, @@ -1395,9 +1179,9 @@ dependencies = [ { name = "pydantic" }, { name = "tables" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/68/52/ad3abc8265b424238a545a4e1f95e2d7e4f3511ea3fa02ad1eca53df7857/policyengine_uk-2.50.0.tar.gz", hash = "sha256:f6ec9b8abce7995b48db70c700bd5096abea60e2bac03ec1cbffaafe1a93f6a8", size = 1048546, upload-time = "2025-09-03T09:16:54.493Z" } +sdist = { url = "https://files.pythonhosted.org/packages/d3/af/d796c74d16536e072fa1cd5fb2ab85d66d9c62610db631a548d5161a6cca/policyengine_uk-2.55.3.tar.gz", hash = "sha256:28a2e3c9f63cd89bce4ddaded6861f75e6116863c9bad32731b77d0e9731e27c", size = 1051059, upload-time = "2025-10-22T10:04:54.355Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/78/9e/e2a3089864d636a7c1344b06d72219f66b02791272f86bbdfca23c511f6e/policyengine_uk-2.50.0-py3-none-any.whl", hash = "sha256:d099d14eda66ea8872a6e2a41110e8da298818dbe9e573a75ab5cdd65d85bf03", size = 1605337, upload-time = "2025-09-03T09:16:52.616Z" }, + { url = 
"https://files.pythonhosted.org/packages/11/bf/b64aeb51d68d3a80ee9b5e7c35e978fa1a6b16278816a8d370ce7d8623fc/policyengine_uk-2.55.3-py3-none-any.whl", hash = "sha256:13e29e10e6b45b278fb894474991ffd0560c54e1e8e45571560d026aec1dcf01", size = 1611162, upload-time = "2025-10-22T10:04:52.352Z" }, ] [[package]] @@ -1426,32 +1210,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/84/03/0d3ce49e2505ae70cf43bc5bb3033955d2fc9f932163e84dc0779cc47f48/prompt_toolkit-3.0.52-py3-none-any.whl", hash = "sha256:9aac639a3bbd33284347de5ad8d68ecc044b91a762dc39b7c21095fcd6a19955", size = 391431, upload-time = "2025-08-27T15:23:59.498Z" }, ] -[[package]] -name = "proto-plus" -version = "1.26.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "protobuf" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/f4/ac/87285f15f7cce6d4a008f33f1757fb5a13611ea8914eb58c3d0d26243468/proto_plus-1.26.1.tar.gz", hash = "sha256:21a515a4c4c0088a773899e23c7bbade3d18f9c66c73edd4c7ee3816bc96a012", size = 56142, upload-time = "2025-03-10T15:54:38.843Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4e/6d/280c4c2ce28b1593a19ad5239c8b826871fc6ec275c21afc8e1820108039/proto_plus-1.26.1-py3-none-any.whl", hash = "sha256:13285478c2dcf2abb829db158e1047e2f1e8d63a077d94263c2b88b043c75a66", size = 50163, upload-time = "2025-03-10T15:54:37.335Z" }, -] - -[[package]] -name = "protobuf" -version = "6.32.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/c0/df/fb4a8eeea482eca989b51cffd274aac2ee24e825f0bf3cbce5281fa1567b/protobuf-6.32.0.tar.gz", hash = "sha256:a81439049127067fc49ec1d36e25c6ee1d1a2b7be930675f919258d03c04e7d2", size = 440614, upload-time = "2025-08-14T21:21:25.015Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/33/18/df8c87da2e47f4f1dcc5153a81cd6bca4e429803f4069a299e236e4dd510/protobuf-6.32.0-cp310-abi3-win32.whl", hash = 
"sha256:84f9e3c1ff6fb0308dbacb0950d8aa90694b0d0ee68e75719cb044b7078fe741", size = 424409, upload-time = "2025-08-14T21:21:12.366Z" }, - { url = "https://files.pythonhosted.org/packages/e1/59/0a820b7310f8139bd8d5a9388e6a38e1786d179d6f33998448609296c229/protobuf-6.32.0-cp310-abi3-win_amd64.whl", hash = "sha256:a8bdbb2f009cfc22a36d031f22a625a38b615b5e19e558a7b756b3279723e68e", size = 435735, upload-time = "2025-08-14T21:21:15.046Z" }, - { url = "https://files.pythonhosted.org/packages/cc/5b/0d421533c59c789e9c9894683efac582c06246bf24bb26b753b149bd88e4/protobuf-6.32.0-cp39-abi3-macosx_10_9_universal2.whl", hash = "sha256:d52691e5bee6c860fff9a1c86ad26a13afbeb4b168cd4445c922b7e2cf85aaf0", size = 426449, upload-time = "2025-08-14T21:21:16.687Z" }, - { url = "https://files.pythonhosted.org/packages/ec/7b/607764ebe6c7a23dcee06e054fd1de3d5841b7648a90fd6def9a3bb58c5e/protobuf-6.32.0-cp39-abi3-manylinux2014_aarch64.whl", hash = "sha256:501fe6372fd1c8ea2a30b4d9be8f87955a64d6be9c88a973996cef5ef6f0abf1", size = 322869, upload-time = "2025-08-14T21:21:18.282Z" }, - { url = "https://files.pythonhosted.org/packages/40/01/2e730bd1c25392fc32e3268e02446f0d77cb51a2c3a8486b1798e34d5805/protobuf-6.32.0-cp39-abi3-manylinux2014_x86_64.whl", hash = "sha256:75a2aab2bd1aeb1f5dc7c5f33bcb11d82ea8c055c9becbb41c26a8c43fd7092c", size = 322009, upload-time = "2025-08-14T21:21:19.893Z" }, - { url = "https://files.pythonhosted.org/packages/9c/f2/80ffc4677aac1bc3519b26bc7f7f5de7fce0ee2f7e36e59e27d8beb32dd1/protobuf-6.32.0-py3-none-any.whl", hash = "sha256:ba377e5b67b908c8f3072a57b63e2c6a4cbd18aea4ed98d2584350dbf46f2783", size = 169287, upload-time = "2025-08-14T21:21:23.515Z" }, -] - [[package]] name = "psutil" version = "6.1.1" @@ -1467,25 +1225,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/7b/d7/7831438e6c3ebbfa6e01a927127a6cb42ad3ab844247f3c5b96bea25d73d/psutil-6.1.1-cp37-abi3-win_amd64.whl", hash = "sha256:f35cfccb065fff93529d2afb4a2e89e363fe63ca1e4a5da22b603a85833c2649", size = 
254444, upload-time = "2024-12-19T18:22:11.335Z" }, ] -[[package]] -name = "psycopg2-binary" -version = "2.9.10" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/cb/0e/bdc8274dc0585090b4e3432267d7be4dfbfd8971c0fa59167c711105a6bf/psycopg2-binary-2.9.10.tar.gz", hash = "sha256:4b3df0e6990aa98acda57d983942eff13d824135fe2250e6522edaa782a06de2", size = 385764, upload-time = "2024-10-16T11:24:58.126Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/3e/30/d41d3ba765609c0763505d565c4d12d8f3c79793f0d0f044ff5a28bf395b/psycopg2_binary-2.9.10-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:26540d4a9a4e2b096f1ff9cce51253d0504dca5a85872c7f7be23be5a53eb18d", size = 3044699, upload-time = "2024-10-16T11:21:42.841Z" }, - { url = "https://files.pythonhosted.org/packages/35/44/257ddadec7ef04536ba71af6bc6a75ec05c5343004a7ec93006bee66c0bc/psycopg2_binary-2.9.10-cp313-cp313-macosx_14_0_arm64.whl", hash = "sha256:e217ce4d37667df0bc1c397fdcd8de5e81018ef305aed9415c3b093faaeb10fb", size = 3275245, upload-time = "2024-10-16T11:21:51.989Z" }, - { url = "https://files.pythonhosted.org/packages/1b/11/48ea1cd11de67f9efd7262085588790a95d9dfcd9b8a687d46caf7305c1a/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:245159e7ab20a71d989da00f280ca57da7641fa2cdcf71749c193cea540a74f7", size = 2851631, upload-time = "2024-10-16T11:21:57.584Z" }, - { url = "https://files.pythonhosted.org/packages/62/e0/62ce5ee650e6c86719d621a761fe4bc846ab9eff8c1f12b1ed5741bf1c9b/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:3c4ded1a24b20021ebe677b7b08ad10bf09aac197d6943bfe6fec70ac4e4690d", size = 3082140, upload-time = "2024-10-16T11:22:02.005Z" }, - { url = 
"https://files.pythonhosted.org/packages/27/ce/63f946c098611f7be234c0dd7cb1ad68b0b5744d34f68062bb3c5aa510c8/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3abb691ff9e57d4a93355f60d4f4c1dd2d68326c968e7db17ea96df3c023ef73", size = 3264762, upload-time = "2024-10-16T11:22:06.412Z" }, - { url = "https://files.pythonhosted.org/packages/43/25/c603cd81402e69edf7daa59b1602bd41eb9859e2824b8c0855d748366ac9/psycopg2_binary-2.9.10-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8608c078134f0b3cbd9f89b34bd60a943b23fd33cc5f065e8d5f840061bd0673", size = 3020967, upload-time = "2024-10-16T11:22:11.583Z" }, - { url = "https://files.pythonhosted.org/packages/5f/d6/8708d8c6fca531057fa170cdde8df870e8b6a9b136e82b361c65e42b841e/psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:230eeae2d71594103cd5b93fd29d1ace6420d0b86f4778739cb1a5a32f607d1f", size = 2872326, upload-time = "2024-10-16T11:22:16.406Z" }, - { url = "https://files.pythonhosted.org/packages/ce/ac/5b1ea50fc08a9df82de7e1771537557f07c2632231bbab652c7e22597908/psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:bb89f0a835bcfc1d42ccd5f41f04870c1b936d8507c6df12b7737febc40f0909", size = 2822712, upload-time = "2024-10-16T11:22:21.366Z" }, - { url = "https://files.pythonhosted.org/packages/c4/fc/504d4503b2abc4570fac3ca56eb8fed5e437bf9c9ef13f36b6621db8ef00/psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:f0c2d907a1e102526dd2986df638343388b94c33860ff3bbe1384130828714b1", size = 2920155, upload-time = "2024-10-16T11:22:25.684Z" }, - { url = "https://files.pythonhosted.org/packages/b2/d1/323581e9273ad2c0dbd1902f3fb50c441da86e894b6e25a73c3fda32c57e/psycopg2_binary-2.9.10-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:f8157bed2f51db683f31306aa497311b560f2265998122abe1dce6428bd86567", size = 2959356, upload-time = "2024-10-16T11:22:30.562Z" }, - { url = 
"https://files.pythonhosted.org/packages/08/50/d13ea0a054189ae1bc21af1d85b6f8bb9bbc5572991055d70ad9006fe2d6/psycopg2_binary-2.9.10-cp313-cp313-win_amd64.whl", hash = "sha256:27422aa5f11fbcd9b18da48373eb67081243662f9b46e6fd07c3eb46e4535142", size = 2569224, upload-time = "2025-01-04T20:09:19.234Z" }, -] - [[package]] name = "ptyprocess" version = "0.7.0" @@ -1513,27 +1252,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/e0/a9/023730ba63db1e494a271cb018dcd361bd2c917ba7004c3e49d5daf795a2/py_cpuinfo-9.0.0-py3-none-any.whl", hash = "sha256:859625bc251f64e21f077d099d4162689c762b5d6a4c3c97553d56241c9674d5", size = 22335, upload-time = "2022-10-25T20:38:27.636Z" }, ] -[[package]] -name = "pyasn1" -version = "0.6.1" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/ba/e9/01f1a64245b89f039897cb0130016d79f77d52669aae6ee7b159a6c4c018/pyasn1-0.6.1.tar.gz", hash = "sha256:6f580d2bdd84365380830acf45550f2511469f673cb4a5ae3857a3170128b034", size = 145322, upload-time = "2024-09-10T22:41:42.55Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/c8/f1/d6a797abb14f6283c0ddff96bbdd46937f64122b8c925cab503dd37f8214/pyasn1-0.6.1-py3-none-any.whl", hash = "sha256:0d632f46f2ba09143da3a8afe9e33fb6f92fa2320ab7e886e2d0f7672af84629", size = 83135, upload-time = "2024-09-11T16:00:36.122Z" }, -] - -[[package]] -name = "pyasn1-modules" -version = "0.4.2" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/e9/e6/78ebbb10a8c8e4b61a59249394a4a594c1a7af95593dc933a349c8d00964/pyasn1_modules-0.4.2.tar.gz", hash = "sha256:677091de870a80aae844b1ca6134f54652fa2c8c5a52aa396440ac3106e941e6", size = 307892, upload-time = "2025-03-28T02:41:22.17Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/47/8d/d529b5d697919ba8c11ad626e835d4039be708a35b0d22de83a269a6682c/pyasn1_modules-0.4.2-py3-none-any.whl", hash = 
"sha256:29253a9207ce32b64c3ac6600edc75368f98473906e8fd1043bd6b5b1de2c14a", size = 181259, upload-time = "2025-03-28T02:41:19.028Z" }, -] - [[package]] name = "pybtex" version = "0.25.1" @@ -1654,21 +1372,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/c7/21/705964c7812476f378728bdf590ca4b771ec72385c533964653c68e86bdc/pygments-2.19.2-py3-none-any.whl", hash = "sha256:86540386c03d588bb81d44bc3928634ff26449851e99741617ecb9037ee5ec0b", size = 1225217, upload-time = "2025-06-21T13:39:07.939Z" }, ] -[[package]] -name = "pymysql" -version = "1.1.2" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/f5/ae/1fe3fcd9f959efa0ebe200b8de88b5a5ce3e767e38c7ac32fb179f16a388/pymysql-1.1.2.tar.gz", hash = "sha256:4961d3e165614ae65014e361811a724e2044ad3ea3739de9903ae7c21f539f03", size = 48258, upload-time = "2025-08-24T12:55:55.146Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/7c/4c/ad33b92b9864cbde84f259d5df035a6447f91891f5be77788e2a3892bce3/pymysql-1.1.2-py3-none-any.whl", hash = "sha256:e6b1d89711dd51f8f74b1631fe08f039e7d76cf67a42a323d3178f0f25762ed9", size = 45300, upload-time = "2025-08-24T12:55:53.394Z" }, -] - -[[package]] -name = "pyperclip" -version = "1.9.0" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/30/23/2f0a3efc4d6a32f3b63cdff36cd398d9701d26cda58e3ab97ac79fb5e60d/pyperclip-1.9.0.tar.gz", hash = "sha256:b7de0142ddc81bfc5c7507eea19da920b92252b548b96186caf94a5e2527d310", size = 20961, upload-time = "2024-06-18T20:38:48.401Z" } - [[package]] name = "pyproject-hooks" version = "1.2.0" @@ -1851,19 +1554,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/1e/db/4254e3eabe8020b458f1a747140d32277ec7a271daf1d235b70dc0b4e6e3/requests-2.32.5-py3-none-any.whl", hash = "sha256:2462f94637a34fd532264295e186976db0f5d453d1cdd31473c85a6a161affb6", size = 64738, upload-time = "2025-08-18T20:46:00.542Z" }, ] -[[package]] 
-name = "rich" -version = "14.1.0" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "markdown-it-py" }, - { name = "pygments" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/fe/75/af448d8e52bf1d8fa6a9d089ca6c07ff4453d86c65c145d0a300bb073b9b/rich-14.1.0.tar.gz", hash = "sha256:e497a48b844b0320d45007cdebfeaeed8db2a4f4bcf49f15e455cfc4af11eaa8", size = 224441, upload-time = "2025-07-25T07:32:58.125Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/e3/30/3c4d035596d3cf444529e0b2953ad0466f6049528a879d27534700580395/rich-14.1.0-py3-none-any.whl", hash = "sha256:536f5f1785986d6dbdea3c75205c473f970777b4a0d6c6dd1b696aa05a3fa04f", size = 243368, upload-time = "2025-07-25T07:32:56.73Z" }, -] - [[package]] name = "rpds-py" version = "0.27.1" @@ -1930,18 +1620,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/32/7d/97119da51cb1dd3f2f3c0805f155a3aa4a95fa44fe7d78ae15e69edf4f34/rpds_py-0.27.1-cp314-cp314t-win_amd64.whl", hash = "sha256:6567d2bb951e21232c2f660c24cf3470bb96de56cdcb3f071a83feeaff8a2772", size = 230097, upload-time = "2025-08-27T12:15:03.961Z" }, ] -[[package]] -name = "rsa" -version = "4.9.1" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pyasn1" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/da/8a/22b7beea3ee0d44b1916c0c1cb0ee3af23b700b6da9f04991899d0c555d4/rsa-4.9.1.tar.gz", hash = "sha256:e7bdbfdb5497da4c07dfd35530e1a902659db6ff241e39d9953cad06ebd0ae75", size = 29034, upload-time = "2025-04-16T09:51:18.218Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/64/8d/0133e4eb4beed9e425d9a98ed6e081a55d195481b7632472be1af08d2f6b/rsa-4.9.1-py3-none-any.whl", hash = "sha256:68635866661c6836b8d39430f97a996acbd61bfa49406748ea243539fe239762", size = 34696, upload-time = "2025-04-16T09:51:17.142Z" }, -] - [[package]] name = "ruff" version = "0.12.11" @@ -2257,19 +1935,6 @@ wheels = [ { url = 
"https://files.pythonhosted.org/packages/b8/d9/13bdde6521f322861fab67473cec4b1cc8999f3871953531cf61945fad92/sqlalchemy-2.0.43-py3-none-any.whl", hash = "sha256:1681c21dd2ccee222c2fe0bef671d1aef7c504087c9c4e800371cfcc8ac966fc", size = 1924759, upload-time = "2025-08-11T15:39:53.024Z" }, ] -[[package]] -name = "sqlmodel" -version = "0.0.24" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "pydantic" }, - { name = "sqlalchemy" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/86/4b/c2ad0496f5bdc6073d9b4cef52be9c04f2b37a5773441cc6600b1857648b/sqlmodel-0.0.24.tar.gz", hash = "sha256:cc5c7613c1a5533c9c7867e1aab2fd489a76c9e8a061984da11b4e613c182423", size = 116780, upload-time = "2025-03-07T05:43:32.887Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/16/91/484cd2d05569892b7fef7f5ceab3bc89fb0f8a8c0cde1030d383dbc5449c/sqlmodel-0.0.24-py3-none-any.whl", hash = "sha256:6778852f09370908985b667d6a3ab92910d0d5ec88adcaf23dbc242715ff7193", size = 28622, upload-time = "2025-03-07T05:43:30.37Z" }, -] - [[package]] name = "stack-data" version = "0.6.3" @@ -2438,15 +2103,6 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/0b/2c/87f3254fd8ffd29e4c02732eee68a83a1d3c346ae39bc6822dcbcb697f2b/wheel-0.45.1-py3-none-any.whl", hash = "sha256:708e7481cc80179af0e556bbf0cc00b8444c7321e2700b8d8580231d13017248", size = 72494, upload-time = "2024-11-23T00:18:21.207Z" }, ] -[[package]] -name = "widgetsnbextension" -version = "4.0.14" -source = { registry = "https://pypi.org/simple" } -sdist = { url = "https://files.pythonhosted.org/packages/41/53/2e0253c5efd69c9656b1843892052a31c36d37ad42812b5da45c62191f7e/widgetsnbextension-4.0.14.tar.gz", hash = "sha256:a3629b04e3edb893212df862038c7232f62973373869db5084aed739b437b5af", size = 1097428, upload-time = "2025-04-10T13:01:25.628Z" } -wheels = [ - { url = 
"https://files.pythonhosted.org/packages/ca/51/5447876806d1088a0f8f71e16542bf350918128d0a69437df26047c8e46f/widgetsnbextension-4.0.14-py3-none-any.whl", hash = "sha256:4875a9eaf72fbf5079dc372a51a9f268fc38d46f767cbf85c43a36da5cb9b575", size = 2196503, upload-time = "2025-04-10T13:01:23.086Z" }, -] - [[package]] name = "yaml-changelog" version = "0.3.0"