Skip to content

Commit 9e35350

Browse files
authored
Merge pull request #641 from effigies/fix/deterministic_derivs
RF: Write derivatives once, using deterministic gzip settings
2 parents 11ac7a1 + 8595e94 commit 9e35350

File tree

3 files changed

+116
-20
lines changed

3 files changed

+116
-20
lines changed

niworkflows/interfaces/bids.py

Lines changed: 29 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
from nipype.interfaces.io import add_traits
3333
from templateflow.api import templates as _get_template_list
3434
from ..utils.bids import _init_layout, relative_to_root
35-
from ..utils.images import overwrite_header
35+
from ..utils.images import set_consumables, unsafe_write_nifti_header_and_data
3636
from ..utils.misc import splitext as _splitext, _copy_any
3737

3838
regz = re.compile(r"\.gz$")
@@ -576,16 +576,18 @@ def _run_interface(self, runtime):
576576
out_file = out_path / dest_file
577577
out_file.parent.mkdir(exist_ok=True, parents=True)
578578
self._results["out_file"].append(str(out_file))
579-
self._results["compression"].append(_copy_any(orig_file, str(out_file)))
579+
self._results["compression"].append(str(dest_file).endswith(".gz"))
580+
581+
# Set data and header iff changes need to be made. If these are
582+
# still None when it's time to write, just copy.
583+
new_data, new_header = None, None
580584

581585
is_nifti = out_file.name.endswith(
582586
(".nii", ".nii.gz")
583587
) and not out_file.name.endswith((".dtseries.nii", ".dtseries.nii.gz"))
584588
data_dtype = self.inputs.data_dtype or DEFAULT_DTYPES[self.inputs.suffix]
585589
if is_nifti and any((self.inputs.check_hdr, data_dtype)):
586-
# Do not use mmap; if we need to access the data at all, it will be to
587-
# rewrite, risking a BusError
588-
nii = nb.load(out_file, mmap=False)
590+
nii = nb.load(orig_file)
589591

590592
if self.inputs.check_hdr:
591593
hdr = nii.header
@@ -607,12 +609,10 @@ def _run_interface(self, runtime):
607609

608610
if curr_codes != xcodes or curr_units != units:
609611
self._results["fixed_hdr"][i] = True
610-
hdr.set_qform(nii.affine, xcodes[0])
611-
hdr.set_sform(nii.affine, xcodes[1])
612-
hdr.set_xyzt_units(*units)
613-
614-
# Rewrite file with new header
615-
overwrite_header(nii, out_file)
612+
new_header = hdr.copy()
613+
new_header.set_qform(nii.affine, xcodes[0])
614+
new_header.set_sform(nii.affine, xcodes[1])
615+
new_header.set_xyzt_units(*units)
616616

617617
if data_dtype == "source": # match source dtype
618618
try:
@@ -624,9 +624,6 @@ def _run_interface(self, runtime):
624624
data_dtype = None
625625

626626
if data_dtype:
627-
if self.inputs.check_hdr:
628-
# load updated NIfTI
629-
nii = nb.load(out_file, mmap=False)
630627
data_dtype = np.dtype(data_dtype)
631628
orig_dtype = nii.get_data_dtype()
632629
if orig_dtype != data_dtype:
@@ -639,9 +636,24 @@ def _run_interface(self, runtime):
639636
else:
640637
new_data = np.asanyarray(nii.dataobj, dtype=data_dtype)
641638
# and set header to match
642-
nii.set_data_dtype(data_dtype)
643-
nii = nii.__class__(new_data, nii.affine, nii.header)
644-
nii.to_filename(out_file)
639+
if new_header is None:
640+
new_header = nii.header.copy()
641+
new_header.set_data_dtype(data_dtype)
642+
del nii
643+
644+
if new_data is new_header is None:
645+
_copy_any(orig_file, str(out_file))
646+
else:
647+
orig_img = nb.load(orig_file)
648+
if new_data is None:
649+
set_consumables(new_header, orig_img.dataobj)
650+
new_data = orig_img.dataobj.get_unscaled()
651+
unsafe_write_nifti_header_and_data(
652+
fname=out_file,
653+
header=new_header,
654+
data=new_data
655+
)
656+
del orig_img
645657

646658
if len(self._results["out_file"]) == 1:
647659
meta_fields = self.inputs.copyable_trait_names()

niworkflows/interfaces/tests/test_bids.py

Lines changed: 80 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
import os
33
from pathlib import Path
44
import json
5+
from hashlib import sha1
56

67
import numpy as np
78
import nibabel as nb
@@ -24,61 +25,70 @@
2425

2526
@pytest.mark.parametrize("out_path_base", [None, "fmriprep"])
2627
@pytest.mark.parametrize(
27-
"source,input_files,entities,expectation",
28+
"source,input_files,entities,expectation,checksum",
2829
[
2930
(
3031
T1W_PATH,
3132
["anat.nii.gz"],
3233
{"desc": "preproc"},
3334
"sub-100185/anat/sub-100185_desc-preproc_T1w.nii.gz",
35+
"7c047921def32da260df4a985019b9f5231659fa",
3436
),
3537
(
3638
T1W_PATH,
3739
["anat.nii.gz"],
3840
{"desc": "preproc", "space": "MNI"},
3941
"sub-100185/anat/sub-100185_space-MNI_desc-preproc_T1w.nii.gz",
42+
"b22399f50ce454049d5d074457a92ab13e7fdf8c",
4043
),
4144
(
4245
T1W_PATH,
4346
["anat.nii.gz"],
4447
{"desc": "preproc", "space": "MNI", "resolution": "native"},
4548
"sub-100185/anat/sub-100185_space-MNI_desc-preproc_T1w.nii.gz",
49+
"b22399f50ce454049d5d074457a92ab13e7fdf8c",
4650
),
4751
(
4852
T1W_PATH,
4953
["anat.nii.gz"],
5054
{"desc": "preproc", "space": "MNI", "resolution": "high"},
5155
"sub-100185/anat/sub-100185_space-MNI_res-high_desc-preproc_T1w.nii.gz",
56+
"b22399f50ce454049d5d074457a92ab13e7fdf8c",
5257
),
5358
(
5459
T1W_PATH,
5560
["tfm.txt"],
5661
{"from": "fsnative", "to": "T1w", "suffix": "xfm"},
5762
"sub-100185/anat/sub-100185_from-fsnative_to-T1w_mode-image_xfm.txt",
63+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
5864
),
5965
(
6066
T1W_PATH,
6167
["tfm.h5"],
6268
{"from": "MNI152NLin2009cAsym", "to": "T1w", "suffix": "xfm"},
6369
"sub-100185/anat/sub-100185_from-MNI152NLin2009cAsym_to-T1w_mode-image_xfm.h5",
70+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
6471
),
6572
(
6673
T1W_PATH,
6774
["anat.nii.gz"],
6875
{"desc": "brain", "suffix": "mask"},
6976
"sub-100185/anat/sub-100185_desc-brain_mask.nii.gz",
77+
"d425f0096b6b6d1252973e48b31d760c0b1bdc11",
7078
),
7179
(
7280
T1W_PATH,
7381
["anat.nii.gz"],
7482
{"desc": "brain", "suffix": "mask", "space": "MNI"},
7583
"sub-100185/anat/sub-100185_space-MNI_desc-brain_mask.nii.gz",
84+
"a2a6efa16eb23173d0ee64779de879711bc74643",
7685
),
7786
(
7887
T1W_PATH,
7988
["anat.surf.gii"],
8089
{"suffix": "pial", "hemi": "L"},
8190
"sub-100185/anat/sub-100185_hemi-L_pial.surf.gii",
91+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
8292
),
8393
(
8494
T1W_PATH,
@@ -88,6 +98,8 @@
8898
f"sub-100185/anat/sub-100185_desc-{s}_dseg.nii"
8999
for s in ("aseg", "aparcaseg")
90100
],
101+
["a235cdf59f9bf077ba30bf2523a56508e3a5aabb",
102+
"a235cdf59f9bf077ba30bf2523a56508e3a5aabb"],
91103
),
92104
(
93105
T1W_PATH,
@@ -97,6 +109,8 @@
97109
f"sub-100185/anat/sub-100185_desc-preproc_T1w.{ext}"
98110
for ext in ("nii", "json")
99111
],
112+
["25c107d4a3e6f98e48aa752c5bbd88ab8e8d069f",
113+
"da39a3ee5e6b4b0d3255bfef95601890afd80709"],
100114
),
101115
(
102116
T1W_PATH,
@@ -106,63 +120,73 @@
106120
f"sub-100185/anat/sub-100185_label-{lab}_probseg.nii.gz"
107121
for lab in ("GM", "WM", "CSF")
108122
],
123+
["7c047921def32da260df4a985019b9f5231659fa"] * 3,
109124
),
110125
# BOLD data
111126
(
112127
BOLD_PATH,
113128
["aroma.csv"],
114129
{"suffix": "AROMAnoiseICs"},
115130
"sub-100185/func/sub-100185_task-machinegame_run-1_AROMAnoiseICs.csv",
131+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
116132
),
117133
(
118134
BOLD_PATH,
119135
["confounds.tsv"],
120136
{"suffix": "regressors", "desc": "confounds"},
121137
"sub-100185/func/sub-100185_task-machinegame_run-1_desc-confounds_regressors.tsv",
138+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
122139
),
123140
(
124141
BOLD_PATH,
125142
["mixing.tsv"],
126143
{"suffix": "mixing", "desc": "MELODIC"},
127144
"sub-100185/func/sub-100185_task-machinegame_run-1_desc-MELODIC_mixing.tsv",
145+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
128146
),
129147
(
130148
BOLD_PATH,
131149
["lh.func.gii"],
132150
{"space": "fsaverage", "density": "10k", "hemi": "L"},
133151
"sub-100185/func/sub-100185_task-machinegame_run-1_"
134152
"space-fsaverage_den-10k_hemi-L_bold.func.gii",
153+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
135154
),
136155
(
137156
BOLD_PATH,
138157
["hcp.dtseries.nii"],
139158
{"space": "fsLR", "density": "91k"},
140159
"sub-100185/func/sub-100185_task-machinegame_run-1_"
141160
"space-fsLR_den-91k_bold.dtseries.nii",
161+
"53d9b486d08fec5a952f68fcbcddb38a72818d4c",
142162
),
143163
(
144164
BOLD_PATH,
145165
["ref.nii"],
146166
{"space": "MNI", "suffix": "boldref"},
147167
"sub-100185/func/sub-100185_task-machinegame_run-1_space-MNI_boldref.nii",
168+
"53d9b486d08fec5a952f68fcbcddb38a72818d4c",
148169
),
149170
(
150171
BOLD_PATH,
151172
["dseg.nii"],
152173
{"space": "MNI", "suffix": "dseg", "desc": "aseg"},
153174
"sub-100185/func/sub-100185_task-machinegame_run-1_space-MNI_desc-aseg_dseg.nii",
175+
"6d2cae7f56c246d7934e2e21e7b472ecc63a4257",
154176
),
155177
(
156178
BOLD_PATH,
157179
["mask.nii"],
158180
{"space": "MNI", "suffix": "mask", "desc": "brain"},
159181
"sub-100185/func/sub-100185_task-machinegame_run-1_space-MNI_desc-brain_mask.nii",
182+
"c365991854931181a1444d6803f5289448e7e266",
160183
),
161184
(
162185
BOLD_PATH,
163186
["bold.nii"],
164187
{"space": "MNI", "desc": "preproc"},
165188
"sub-100185/func/sub-100185_task-machinegame_run-1_space-MNI_desc-preproc_bold.nii",
189+
"aa1eed935e6a8dcca646b0c78ee57218e30e2974",
166190
),
167191
# Nondeterministic order - do we really need this to work, or we can stay safe with
168192
# MapNodes?
@@ -175,30 +199,35 @@
175199
["anat.html"],
176200
{"desc": "conform", "datatype": "figures"},
177201
"sub-100185/figures/sub-100185_desc-conform_T1w.html",
202+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
178203
),
179204
(
180205
BOLD_PATH,
181206
["aroma.csv"],
182207
{"suffix": "AROMAnoiseICs", "extension": "h5"},
183208
ValueError,
209+
None,
184210
),
185211
(
186212
T1W_PATH,
187213
["anat.nii.gz"] * 3,
188214
{"desc": "preproc", "space": "MNI"},
189215
ValueError,
216+
None,
190217
),
191218
(
192219
"sub-07/ses-preop/anat/sub-07_ses-preop_T1w.nii.gz",
193220
["tfm.h5"],
194221
{"from": "orig", "to": "target", "suffix": "xfm"},
195222
"sub-07/ses-preop/anat/sub-07_ses-preop_from-orig_to-target_mode-image_xfm.h5",
223+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
196224
),
197225
(
198226
"sub-07/ses-preop/anat/sub-07_ses-preop_run-1_T1w.nii.gz",
199227
["tfm.txt"],
200228
{"from": "orig", "to": "T1w", "suffix": "xfm"},
201229
"sub-07/ses-preop/anat/sub-07_ses-preop_run-1_from-orig_to-T1w_mode-image_xfm.txt",
230+
"da39a3ee5e6b4b0d3255bfef95601890afd80709",
202231
),
203232
],
204233
)
@@ -210,6 +239,7 @@ def test_DerivativesDataSink_build_path(
210239
input_files,
211240
entities,
212241
expectation,
242+
checksum,
213243
dismiss_entities,
214244
):
215245
"""Check a few common derivatives generated by NiPreps."""
@@ -274,9 +304,12 @@ def test_DerivativesDataSink_build_path(
274304
output = dds.run().outputs.out_file
275305
if isinstance(output, str):
276306
output = [output]
307+
checksum = [checksum]
277308

278309
for out, exp in zip(output, expectation):
279310
assert Path(out).relative_to(tmp_path) == Path(base) / exp
311+
for out, chksum in zip(output, checksum):
312+
assert sha1(Path(out).read_bytes()).hexdigest() == chksum
280313

281314

282315
def test_DerivativesDataSink_dtseries_json_hack(tmp_path):
@@ -423,6 +456,52 @@ def test_DerivativesDataSink_t1w(tmp_path, space, size, units, xcodes, fixed):
423456
assert nii.header.get_xyzt_units() == ("mm", "unknown")
424457

425458

459+
@pytest.mark.parametrize("dtype", ("i2", "u2", "f4"))
460+
def test_DerivativesDataSink_values(tmp_path, dtype):
461+
# We use static checksums above, which ensures we don't break things, but
462+
# pins the tests to specific values.
463+
# Here we use random values, check that the values are preserved, and then
464+
# the checksums are unchanged across two runs.
465+
fname = str(tmp_path / "source.nii.gz")
466+
rng = np.random.default_rng()
467+
hdr = nb.Nifti1Header()
468+
hdr.set_qform(np.eye(4), code=1)
469+
hdr.set_sform(np.eye(4), code=1)
470+
nb.Nifti1Image(rng.uniform(500, 2000, (5, 5, 5)), np.eye(4), hdr).to_filename(fname)
471+
472+
orig_data = np.asanyarray(nb.load(fname).dataobj)
473+
expected = np.rint(orig_data) if dtype[0] in "iu" else orig_data
474+
475+
dds = bintfs.DerivativesDataSink(
476+
base_directory=str(tmp_path),
477+
keep_dtype=True,
478+
data_dtype=dtype,
479+
desc="preproc",
480+
source_file=T1W_PATH,
481+
in_file=fname,
482+
).run()
483+
484+
out_file = Path(dds.outputs.out_file)
485+
486+
nii = nb.load(out_file)
487+
assert np.allclose(nii.dataobj, expected)
488+
489+
checksum = sha1(out_file.read_bytes()).hexdigest()
490+
out_file.unlink()
491+
492+
# Rerun to ensure determinism with non-zero data
493+
dds = bintfs.DerivativesDataSink(
494+
base_directory=str(tmp_path),
495+
keep_dtype=True,
496+
data_dtype=dtype,
497+
desc="preproc",
498+
source_file=T1W_PATH,
499+
in_file=fname,
500+
).run()
501+
502+
assert sha1(out_file.read_bytes()).hexdigest() == checksum
503+
504+
426505
@pytest.mark.parametrize("field", ["RepetitionTime", "UndefinedField"])
427506
def test_ReadSidecarJSON_connection(testdata_dir, field):
428507
"""

niworkflows/utils/images.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
"""Utilities to manipulate images."""
22
import nibabel as nb
33
import numpy as np
4+
from gzip import GzipFile
45

56

67
def rotation2canonical(img):
@@ -33,12 +34,16 @@ def unsafe_write_nifti_header_and_data(fname, header, data):
3334
If you're not using this for NIfTI files specifically, you're playing
3435
with Fortran-ordered fire.
3536
"""
36-
# ImageOpener handles zips transparently
37-
with nb.openers.ImageOpener(fname, mode="wb") as fobj:
37+
with open(fname, "wb") as fobj:
38+
# Avoid setting fname or mtime, for deterministic outputs
39+
if str(fname).endswith(".gz"):
40+
fobj = GzipFile("", "wb", 9, fobj, 0.0)
3841
header.write_to(fobj)
3942
# This function serializes one block at a time to reduce memory usage a bit
4043
# It assumes Fortran-ordered data.
4144
nb.volumeutils.array_to_file(data, fobj, offset=header.get_data_offset())
45+
if str(fname).endswith(".gz"):
46+
fobj.close()
4247

4348

4449
def set_consumables(header, dataobj):

0 commit comments

Comments
 (0)