Skip to content

Commit e85a8dc

Browse files
authored
Merge pull request #296 from The-Strategy-Unit/295_add_metadata_to_parquet_results
add metadata to saved parquet results files
2 parents 8166493 + 93b7994 commit e85a8dc

File tree

2 files changed

+56
-6
lines changed

2 files changed

+56
-6
lines changed

model/results.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -184,12 +184,30 @@ def save_results_files(results: dict, params: dict) -> list:
184184
os.makedirs(path, exist_ok=True)
185185

186186
return [
187-
*[_save_parquet_file(path, k, v) for k, v in results.items()],
187+
*[_save_parquet_file(path, k, v, params) for k, v in results.items()],
188188
_save_params_file(path, params),
189189
]
190190

191191

192-
def _save_parquet_file(path: str, results_name: str, df: pd.DataFrame) -> str:
192+
def _add_metadata_to_dataframe(df: pd.DataFrame, params: dict) -> pd.DataFrame:
    """Add metadata as columns to a dataframe, so that the saved parquet files have useful
    information regarding their provenance

    Each metadata item is read from the model run parameters and written as a constant-valued
    column. The columns are added to a copy, so the dataframe passed in is left unmodified.

    :param df: The dataframe that we want to add the metadata to
    :type df: pd.DataFrame
    :param params: The parameters for the model run, which include metadata
    :type params: dict
    :raises KeyError: if any of the metadata items is missing from params
    :return: A copy of the dataframe, with additional columns "dataset", "scenario",
        "app_version" and "create_datetime"
    :rtype: pd.DataFrame
    """
    metadata_to_save = ("dataset", "scenario", "app_version", "create_datetime")
    # Work on a copy: assigning columns directly on `df` would also change the
    # caller's results dataframe, which may still be used after it is saved.
    df_with_metadata = df.copy()
    for m in metadata_to_save:
        df_with_metadata[m] = params[m]
    return df_with_metadata
206+
207+
208+
def _save_parquet_file(
209+
path: str, results_name: str, df: pd.DataFrame, params: dict
210+
) -> str:
193211
"""Save a results dataframe as parquet
194212
195213
:param path: the folder where we want to save the results to
@@ -201,6 +219,7 @@ def _save_parquet_file(path: str, results_name: str, df: pd.DataFrame) -> str:
201219
:return: the filename of the saved file
202220
:rtype: str
203221
"""
222+
df = _add_metadata_to_dataframe(df, params)
204223
df.to_parquet(filename := f"{path}/{results_name}.parquet")
205224
return filename
206225

tests/test_results.py

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@
1616
combine_results,
1717
generate_results_json,
1818
save_results_files,
19+
_add_metadata_to_dataframe,
1920
)
2021

2122

@@ -330,25 +331,55 @@ def test_save_results_files(mocker):
330331
assert os_m.called_once_with(path, exist_ok=True)
331332

332333
assert save_parquet_mock.call_args_list == [
333-
call(path, "default", "default_df"),
334-
call(path, "step_counts", "step_counts_df"),
334+
call(path, "default", "default_df", params),
335+
call(path, "step_counts", "step_counts_df", params),
335336
]
336337

337338
assert save_params_mock.called_once_with(path, params)
338339

339340

340-
def test_save_parquet_file():
341+
def test_save_parquet_file(mocker):
    """_save_parquet_file adds the metadata, writes the parquet file and returns its filename"""
    # arrange
    df, params = Mock(), Mock()
    # the patched helper hands back the same mock, so `to_parquet` can be checked on `df`
    add_metadata_to_dataframe_mock = mocker.patch(
        "model.results._add_metadata_to_dataframe", return_value=df
    )

    # act
    saved_filename = _save_parquet_file("path", "file", df, params)

    # assert
    assert saved_filename == "path/file.parquet"
    add_metadata_to_dataframe_mock.assert_called_once_with(df, params)
    df.to_parquet.assert_called_once_with(saved_filename)
350356

351357

358+
def test_add_metadata_to_dataframe(mocker):
    """_add_metadata_to_dataframe returns the data plus one constant column per metadata item"""
    # arrange
    metadata_keys = ("dataset", "scenario", "app_version", "create_datetime")
    # each metadata value is simply its own key, which keeps the expectation easy to read
    params = {key: key for key in metadata_keys}
    df = pd.DataFrame({"one": [1], "two": [2]})
    expected = {"one": [1], "two": [2]}
    expected.update({key: [key] for key in metadata_keys})

    # act
    actual = _add_metadata_to_dataframe(df, params)

    # assert
    assert actual.to_dict("list") == expected
381+
382+
352383
def test_save_params_file(mocker):
353384
# arrange
354385
j_mock = mocker.patch("json.dump")

0 commit comments

Comments
 (0)