policyengine-uk-data/policyengine_uk_data/datasets/create_datasets.py at 0edba1ddde4baa3317b39c22681f9a2fc7b0bd2f · PolicyEngine/policyengine-uk-data · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
from policyengine_uk_data.datasets.frs import create_frs
from policyengine_uk_data.storage import STORAGE_FOLDER
import gc
import logging
import os
import io
import numpy as np
import h5py
from policyengine_uk_data.utils.uprating import uprate_dataset
from policyengine_uk_data.utils.progress import (
    ProcessingProgress,
    display_success_panel,
    display_error_panel,
)

logging.basicConfig(level=logging.INFO)

USE_MODAL = os.environ.get("MODAL_CALIBRATE", "0") == "1"


def _dump(arr) -> bytes:
    buf = io.BytesIO()
    np.save(buf, arr)
    return buf.getvalue()


def _build_weights_init(dataset, area_count, r):
    areas_per_household = np.maximum(r.sum(axis=0), 1)
    original_weights = np.log(
        dataset.household.household_weight.values / areas_per_household
        + np.random.random(len(dataset.household.household_weight.values))
        * 0.01
    )
    return np.ones((area_count, len(original_weights))) * original_weights


def _build_log(checkpoints, get_performance, m_c, y_c, m_n, y_n, log_csv):
    import pandas as pd

    performance = pd.DataFrame()
    for epoch, w_bytes in checkpoints:
        w = np.load(io.BytesIO(w_bytes))
        perf = get_performance(w, m_c, y_c, m_n, y_n, [])
        perf["epoch"] = epoch
        perf["loss"] = perf.rel_abs_error**2
        perf["target_name"] = [
            f"{a}/{m}" for a, m in zip(perf.name, perf.metric)
        ]
        performance = pd.concat([performance, perf], ignore_index=True)
    performance.to_csv(log_csv, index=False)
    final_epoch, final_bytes = checkpoints[-1]
    return np.load(io.BytesIO(final_bytes))


def _run_modal_calibrations(
    frs,
    epochs,
    create_constituency_target_matrix,
    create_local_authority_target_matrix,
    create_national_target_matrix,
    get_constituency_performance,
    get_la_performance,
):
    """
    Dispatch both calibrations concurrently to Modal GPU containers.
    Returns (constituency_weights, la_weights) as numpy arrays and
    writes constituency_calibration_log.csv / la_calibration_log.csv.
    """
    import modal
    import pandas as pd
    from policyengine_uk_data.utils.modal_calibrate import (
        app,
        run_calibration,
    )

    def _arr(x):
        return x.values if hasattr(x, "values") else x

    # Build matrices one at a time; serialise immediately and free the
    # DataFrames (keeping only column/index metadata for log reconstruction).

    m_nat, y_nat = create_national_target_matrix(frs.copy())
    m_nat_np = _arr(m_nat)
    y_nat_np = _arr(y_nat)
    m_nat_cols = list(m_nat.columns)
    y_nat_index = list(y_nat.index)
    b_m_nat = _dump(m_nat_np)
    b_y_nat = _dump(y_nat_np)
    del m_nat, y_nat
    gc.collect()

    frs_copy = frs.copy()
    matrix_c, y_c, r_c = create_constituency_target_matrix(frs_copy)
    matrix_c_np = _arr(matrix_c)
    y_c_np = _arr(y_c)
    matrix_c_cols = list(matrix_c.columns)
    y_c_cols = list(y_c.columns)
    wi_c = _build_weights_init(frs_copy, 650, r_c)
    b_matrix_c = _dump(matrix_c_np)
    b_y_c = _dump(y_c_np)
    b_wi_c = _dump(wi_c)
    b_r_c = _dump(r_c)
    del matrix_c, y_c, wi_c, r_c, frs_copy
    gc.collect()

    frs_copy = frs.copy()
    matrix_la, y_la, r_la = create_local_authority_target_matrix(frs_copy)
    matrix_la_np = _arr(matrix_la)
    y_la_np = _arr(y_la)
    matrix_la_cols = list(matrix_la.columns)
    y_la_cols = list(y_la.columns)
    wi_la = _build_weights_init(frs_copy, 360, r_la)
    b_matrix_la = _dump(matrix_la_np)
    b_y_la = _dump(y_la_np)
    b_wi_la = _dump(wi_la)
    b_r_la = _dump(r_la)
    del matrix_la, y_la, wi_la, r_la, frs_copy
    gc.collect()

    with modal.enable_output(), app.run():
        fut_c = run_calibration.spawn(
            b_matrix_c, b_y_c, b_r_c, b_m_nat, b_y_nat, b_wi_c, epochs
        )
        fut_la = run_calibration.spawn(
            b_matrix_la, b_y_la, b_r_la, b_m_nat, b_y_nat, b_wi_la, epochs
        )
        del b_r_c, b_wi_c, b_r_la, b_wi_la
        gc.collect()

        checkpoints_c = fut_c.get()
        checkpoints_la = fut_la.get()

    # Reconstruct DataFrames with correct labels for get_performance
    matrix_c_df = pd.DataFrame(matrix_c_np, columns=matrix_c_cols)
    y_c_df = pd.DataFrame(y_c_np, columns=y_c_cols)
    m_nat_df = pd.DataFrame(m_nat_np, columns=m_nat_cols)
    y_nat_df = pd.Series(y_nat_np, index=y_nat_index)
    matrix_la_df = pd.DataFrame(matrix_la_np, columns=matrix_la_cols)
    y_la_df = pd.DataFrame(y_la_np, columns=y_la_cols)

    weights_c = _build_log(
        checkpoints_c,
        get_constituency_performance,
        matrix_c_df,
        y_c_df,
        m_nat_df,
        y_nat_df,
        "constituency_calibration_log.csv",
    )
    weights_la = _build_log(
        checkpoints_la,
        get_la_performance,
        matrix_la_df,
        y_la_df,
        m_nat_df,
        y_nat_df,
        "la_calibration_log.csv",
    )

    return weights_c, weights_la


def main():
    """Create enhanced FRS dataset with rich progress tracking."""
    try:
        is_testing = os.environ.get("TESTING", "0") == "1"
        epochs = 32 if is_testing else 512

        progress_tracker = ProcessingProgress()

        steps = [
            "Create base FRS dataset",
            "Impute consumption",
            "Impute wealth",
            "Impute VAT",
            "Impute public service usage",
            "Impute income",
            "Impute capital gains",
            "Impute salary sacrifice",
            "Impute student loan plan",
            "Uprate to 2025",
            "Calibrate constituency weights",
            "Calibrate local authority weights",
            "Downrate to 2023",
            "Save final dataset",
        ]

        with progress_tracker.track_dataset_creation(steps) as (
            update_dataset,
            nested_progress,
        ):
            update_dataset("Create base FRS dataset", "processing")
            frs = create_frs(
                raw_frs_folder=STORAGE_FOLDER / "frs_2023_24",
                year=2023,
            )
            frs.save(STORAGE_FOLDER / "frs_2023_24.h5")
            update_dataset("Create base FRS dataset", "completed")

            from policyengine_uk_data.datasets.imputations import (
                impute_consumption,
                impute_wealth,
                impute_vat,
                impute_income,
                impute_capital_gains,
                impute_services,
                impute_salary_sacrifice,
                impute_student_loan_plan,
            )

            update_dataset("Impute wealth", "processing")
            frs = impute_wealth(frs)
            update_dataset("Impute wealth", "completed")

            update_dataset("Impute consumption", "processing")
            frs = impute_consumption(frs)
            update_dataset("Impute consumption", "completed")

            update_dataset("Impute VAT", "processing")
            frs = impute_vat(frs)
            update_dataset("Impute VAT", "completed")

            update_dataset("Impute public service usage", "processing")
            frs = impute_services(frs)
            update_dataset("Impute public service usage", "completed")

            update_dataset("Impute income", "processing")
            frs = impute_income(frs)
            update_dataset("Impute income", "completed")

            update_dataset("Impute capital gains", "processing")
            frs = impute_capital_gains(frs)
            update_dataset("Impute capital gains", "completed")

            update_dataset("Impute salary sacrifice", "processing")
            frs = impute_salary_sacrifice(frs)
            update_dataset("Impute salary sacrifice", "completed")

            update_dataset("Impute student loan plan", "processing")
            frs = impute_student_loan_plan(frs, year=2025)
            update_dataset("Impute student loan plan", "completed")

            update_dataset("Uprate to 2025", "processing")
            frs = uprate_dataset(frs, 2025)
            update_dataset("Uprate to 2025", "completed")

            from policyengine_uk_data.datasets.local_areas.constituencies.loss import (
                create_constituency_target_matrix,
            )
            from policyengine_uk_data.targets.build_loss_matrix import (
                create_target_matrix as create_national_target_matrix,
            )
            from policyengine_uk_data.datasets.local_areas.constituencies.calibrate import (
                get_performance,
            )
            from policyengine_uk_data.datasets.local_areas.local_authorities.calibrate import (
                get_performance as get_la_performance,
            )
            from policyengine_uk_data.datasets.local_areas.local_authorities.loss import (
                create_local_authority_target_matrix,
            )

            if USE_MODAL:
                update_dataset("Calibrate constituency weights", "processing")
                update_dataset(
                    "Calibrate local authority weights", "processing"
                )

                weights_c, weights_la = _run_modal_calibrations(
                    frs,
                    epochs,
                    create_constituency_target_matrix,
                    create_local_authority_target_matrix,
                    create_national_target_matrix,
                    get_performance,
                    get_la_performance,
                )

                with h5py.File(
                    STORAGE_FOLDER / "parliamentary_constituency_weights.h5",
                    "w",
                ) as f:
                    f.create_dataset("2025", data=weights_c)

                with h5py.File(
                    STORAGE_FOLDER / "local_authority_weights.h5", "w"
                ) as f:
                    f.create_dataset("2025", data=weights_la)

                frs_calibrated_constituencies = frs.copy()
                frs_calibrated_constituencies.household.household_weight = (
                    weights_c.sum(axis=0)
                )

                update_dataset("Calibrate constituency weights", "completed")
                update_dataset(
                    "Calibrate local authority weights", "completed"
                )
            else:
                from policyengine_uk_data.utils.calibrate import (
                    calibrate_local_areas,
                )

                update_dataset("Calibrate constituency weights", "processing")
                frs_calibrated_constituencies = calibrate_local_areas(
                    dataset=frs,
                    epochs=epochs,
                    matrix_fn=create_constituency_target_matrix,
                    national_matrix_fn=create_national_target_matrix,
                    area_count=650,
                    weight_file="parliamentary_constituency_weights.h5",
                    excluded_training_targets=[],
                    log_csv="constituency_calibration_log.csv",
                    verbose=True,
                    area_name="Constituency",
                    get_performance=get_performance,
                    nested_progress=nested_progress,
                )
                update_dataset("Calibrate constituency weights", "completed")

                update_dataset(
                    "Calibrate local authority weights", "processing"
                )
                calibrate_local_areas(
                    dataset=frs,
                    epochs=epochs,
                    matrix_fn=create_local_authority_target_matrix,
                    national_matrix_fn=create_national_target_matrix,
                    area_count=360,
                    weight_file="local_authority_weights.h5",
                    excluded_training_targets=[],
                    log_csv="la_calibration_log.csv",
                    verbose=True,
                    area_name="Local Authority",
                    get_performance=get_la_performance,
                    nested_progress=nested_progress,
                )
                update_dataset(
                    "Calibrate local authority weights", "completed"
                )

            update_dataset("Downrate to 2023", "processing")
            frs_calibrated = uprate_dataset(
                frs_calibrated_constituencies, 2023
            )
            update_dataset("Downrate to 2023", "completed")

            update_dataset("Save final dataset", "processing")
            frs_calibrated.save(STORAGE_FOLDER / "enhanced_frs_2023_24.h5")
            update_dataset("Save final dataset", "completed")

        display_success_panel(
            "Dataset creation completed successfully",
            details={
                "base_dataset": "frs_2023_24.h5",
                "enhanced_dataset": "enhanced_frs_2023_24.h5",
                "imputations_applied": "consumption, wealth, VAT, services, income, capital_gains, salary_sacrifice, student_loan_plan",
                "calibration": "national, LA and constituency targets",
                "calibration_backend": "Modal GPU" if USE_MODAL else "CPU",
            },
        )

    except Exception as e:
        display_error_panel(
            f"Dataset creation failed: {str(e)}",
            suggestions=[
                "Check that all required data files are present in storage folder",
                "Verify sufficient disk space for dataset creation",
                "Review log files for detailed error information",
            ],
        )
        raise


if __name__ == "__main__":
    main()