Skip to content

Commit 42b82e3

Browse files
committed
Reduce peak memory by serialising each matrix before building the next one
Free each target matrix DataFrame immediately after serialising it to bytes, keeping only the extracted NumPy arrays and the column/index labels needed to reconstruct the DataFrames for the post-Modal calibration logs. This prevents the data of three Microsimulation objects from sitting in memory simultaneously while building the national, constituency and LA matrices.
1 parent 448e621 commit 42b82e3

File tree

1 file changed

+59
-37
lines changed

1 file changed

+59
-37
lines changed

policyengine_uk_data/datasets/create_datasets.py

Lines changed: 59 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ def _run_modal_calibrations(
6666
Returns (constituency_weights, la_weights) as numpy arrays and
6767
writes constituency_calibration_log.csv / la_calibration_log.csv.
6868
"""
69+
import pandas as pd
6970
from policyengine_uk_data.utils.modal_calibrate import (
7071
app,
7172
run_calibration,
@@ -74,63 +75,84 @@ def _run_modal_calibrations(
7475
def _arr(x):
7576
return x.values if hasattr(x, "values") else x
7677

77-
# Build national matrix once; keep in memory for log generation
78+
# Build matrices one at a time; serialise each immediately and free the
79+
# DataFrames, keeping the NumPy arrays and column/index labels for log reconstruction.
80+
7881
m_nat, y_nat = create_national_target_matrix(frs.copy())
79-
b_m_nat = _dump(_arr(m_nat))
80-
b_y_nat = _dump(_arr(y_nat))
82+
m_nat_np = _arr(m_nat)
83+
y_nat_np = _arr(y_nat)
84+
m_nat_cols = list(m_nat.columns)
85+
y_nat_index = list(y_nat.index)
86+
b_m_nat = _dump(m_nat_np)
87+
b_y_nat = _dump(y_nat_np)
88+
del m_nat, y_nat
89+
gc.collect()
90+
91+
frs_copy = frs.copy()
92+
matrix_c, y_c, r_c = create_constituency_target_matrix(frs_copy)
93+
matrix_c_np = _arr(matrix_c)
94+
y_c_np = _arr(y_c)
95+
matrix_c_cols = list(matrix_c.columns)
96+
y_c_cols = list(y_c.columns)
97+
wi_c = _build_weights_init(frs_copy, 650, r_c)
98+
b_matrix_c = _dump(matrix_c_np)
99+
b_y_c = _dump(y_c_np)
100+
b_wi_c = _dump(wi_c)
101+
b_r_c = _dump(r_c)
102+
del matrix_c, y_c, wi_c, r_c, frs_copy
103+
gc.collect()
104+
105+
frs_copy = frs.copy()
106+
matrix_la, y_la, r_la = create_local_authority_target_matrix(frs_copy)
107+
matrix_la_np = _arr(matrix_la)
108+
y_la_np = _arr(y_la)
109+
matrix_la_cols = list(matrix_la.columns)
110+
y_la_cols = list(y_la.columns)
111+
wi_la = _build_weights_init(frs_copy, 360, r_la)
112+
b_matrix_la = _dump(matrix_la_np)
113+
b_y_la = _dump(y_la_np)
114+
b_wi_la = _dump(wi_la)
115+
b_r_la = _dump(r_la)
116+
del matrix_la, y_la, wi_la, r_la, frs_copy
117+
gc.collect()
81118

82119
with app.run():
83-
# Constituency: build, spawn, keep matrices for log, free before LA
84-
frs_copy = frs.copy()
85-
matrix_c, y_c, r_c = create_constituency_target_matrix(frs_copy)
86-
wi_c = _build_weights_init(frs_copy, 650, r_c)
87120
fut_c = run_calibration.spawn(
88-
_dump(_arr(matrix_c)),
89-
_dump(_arr(y_c)),
90-
_dump(r_c),
91-
b_m_nat,
92-
b_y_nat,
93-
_dump(wi_c),
94-
epochs,
121+
b_matrix_c, b_y_c, b_r_c, b_m_nat, b_y_nat, b_wi_c, epochs
95122
)
96-
del wi_c, r_c, frs_copy
97-
gc.collect()
98-
99-
# LA: build, spawn, keep matrices for log
100-
frs_copy = frs.copy()
101-
matrix_la, y_la, r_la = create_local_authority_target_matrix(frs_copy)
102-
wi_la = _build_weights_init(frs_copy, 360, r_la)
103123
fut_la = run_calibration.spawn(
104-
_dump(_arr(matrix_la)),
105-
_dump(_arr(y_la)),
106-
_dump(r_la),
107-
b_m_nat,
108-
b_y_nat,
109-
_dump(wi_la),
110-
epochs,
124+
b_matrix_la, b_y_la, b_r_la, b_m_nat, b_y_nat, b_wi_la, epochs
111125
)
112-
del wi_la, r_la, frs_copy
126+
del b_r_c, b_wi_c, b_r_la, b_wi_la
113127
gc.collect()
114128

115129
checkpoints_c = fut_c.get()
116130
checkpoints_la = fut_la.get()
117131

132+
# Reconstruct labelled DataFrames/Series for the get_constituency_performance / get_la_performance calls made via _build_log
133+
matrix_c_df = pd.DataFrame(matrix_c_np, columns=matrix_c_cols)
134+
y_c_df = pd.DataFrame(y_c_np, columns=y_c_cols)
135+
m_nat_df = pd.DataFrame(m_nat_np, columns=m_nat_cols)
136+
y_nat_df = pd.Series(y_nat_np, index=y_nat_index)
137+
matrix_la_df = pd.DataFrame(matrix_la_np, columns=matrix_la_cols)
138+
y_la_df = pd.DataFrame(y_la_np, columns=y_la_cols)
139+
118140
weights_c = _build_log(
119141
checkpoints_c,
120142
get_constituency_performance,
121-
matrix_c,
122-
y_c,
123-
m_nat,
124-
y_nat,
143+
matrix_c_df,
144+
y_c_df,
145+
m_nat_df,
146+
y_nat_df,
125147
"constituency_calibration_log.csv",
126148
)
127149
weights_la = _build_log(
128150
checkpoints_la,
129151
get_la_performance,
130-
matrix_la,
131-
y_la,
132-
m_nat,
133-
y_nat,
152+
matrix_la_df,
153+
y_la_df,
154+
m_nat_df,
155+
y_nat_df,
134156
"la_calibration_log.csv",
135157
)
136158

0 commit comments

Comments
 (0)