Skip to content

Commit 02efc60

Browse files
authored
3212 3243 Add utility to check missing files in datalist (#3247)
* [DLMED] add check missing files Signed-off-by: Nic Ma <[email protected]> * [DLMED] enhance PILReader doc Signed-off-by: Nic Ma <[email protected]> * [DLMED] add root dir Signed-off-by: Nic Ma <[email protected]>
1 parent ca15762 commit 02efc60

File tree

6 files changed

+99
-1
lines changed

6 files changed

+99
-1
lines changed

docs/source/data.rst

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -200,6 +200,8 @@ DatasetSummary
200200
Decathlon Datalist
201201
~~~~~~~~~~~~~~~~~~
202202
.. autofunction:: monai.data.load_decathlon_datalist
203+
.. autofunction:: monai.data.load_decathlon_properties
204+
.. autofunction:: monai.data.check_missing_files
203205

204206

205207
DataLoader

monai/data/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
ZipDataset,
2525
)
2626
from .dataset_summary import DatasetSummary
27-
from .decathlon_datalist import load_decathlon_datalist, load_decathlon_properties
27+
from .decathlon_datalist import check_missing_files, load_decathlon_datalist, load_decathlon_properties
2828
from .grid_dataset import GridPatchDataset, PatchDataset, PatchIter
2929
from .image_dataset import ImageDataset
3030
from .image_reader import ImageReader, ITKReader, NibabelReader, NumpyReader, PILReader, WSIReader

monai/data/decathlon_datalist.py

Lines changed: 38 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
import os
1414
from typing import Dict, List, Optional, Sequence, Union, overload
1515

16+
from monai.config import KeysCollection
1617
from monai.utils import ensure_tuple
1718

1819

@@ -148,3 +149,40 @@ def load_decathlon_properties(data_property_file_path: str, property_keys: Union
148149
raise KeyError(f"key {key} is not in the data property file.")
149150
properties[key] = json_data[key]
150151
return properties
152+
153+
154+
def check_missing_files(
    datalist: List[Dict],
    keys: KeysCollection,
    root_dir: Optional[Union[str, os.PathLike]] = None,
    allow_missing_keys: bool = False,
) -> List[str]:
    """Check whether some files in the Decathlon datalist are missing.

    It is helpful to check for missing files before starting a heavy training run.

    Args:
        datalist: a list of data items, every item is a dictionary.
            usually generated by the `load_decathlon_datalist` API.
        keys: expected keys to check in the datalist.
        root_dir: if not None, provides the root dir for the relative file paths in `datalist`.
            both strings and path-like objects (e.g. `pathlib.Path`) are accepted.
        allow_missing_keys: whether to allow missing keys in the datalist items.
            if False, raise an exception when a key is missing. default to False.

    Returns:
        A list of the missing file paths, in the order they were encountered.
        When `root_dir` is provided, each returned path is `root_dir` joined with the datalist entry.

    Raises:
        ValueError: When a key in `keys` is absent from an item and `allow_missing_keys` is False.
        ValueError: When a value stored under one of `keys` is neither a string nor a sequence of strings.

    """
    # `keys` is the same for every item, so normalise it once instead of once per item.
    expected_keys = ensure_tuple(keys)
    # Accept any path-like root (not only `str`); a `pathlib.Path` used to be silently ignored.
    has_root = isinstance(root_dir, (str, os.PathLike))

    missing_files: List[str] = []
    for item in datalist:
        for k in expected_keys:
            if k not in item:
                if not allow_missing_keys:
                    raise ValueError(f"key `{k}` is missing in the datalist item: {item}")
                continue

            # a key may hold a single path or several paths
            for f in ensure_tuple(item[k]):
                if not isinstance(f, str):
                    raise ValueError(f"filepath of key `{k}` must be a string or a list of strings, but got: {f}.")
                if has_root:
                    # os.path.join leaves `f` unchanged when it is already absolute
                    f = os.path.join(root_dir, f)
                if not os.path.exists(f):
                    missing_files.append(f)

    return missing_files

monai/data/image_reader.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -627,6 +627,8 @@ def get_data(self, img):
627627
It computes `spatial_shape` and stores it in meta dict.
628628
When loading a list of files, they are stacked together at a new dimension as the first dimension,
629629
and the meta data of the first image is used to represent the output meta data.
630+
Note that it will switch axis 0 and 1 after loading the array because the `HW` definition in PIL
631+
is different from other common medical packages.
630632
631633
Args:
632634
img: a PIL Image object loaded from a file or a list of PIL Image objects.

tests/min_tests.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ def run_testsuit():
147147
"test_handler_mlflow",
148148
"test_prepare_batch_extra_input",
149149
"test_prepare_batch_default",
150+
"test_check_missing_files",
150151
]
151152
assert sorted(exclude_cases) == sorted(set(exclude_cases)), f"Duplicated items in {exclude_cases}"
152153

tests/test_check_missing_files.py

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# Copyright 2020 - 2021 MONAI Consortium
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
import os
13+
import tempfile
14+
import unittest
15+
16+
import nibabel as nib
17+
import numpy as np
18+
19+
from monai.data import check_missing_files
20+
21+
22+
class TestCheckMissingFiles(unittest.TestCase):
    """Tests for `monai.data.check_missing_files`."""

    def test_content(self):
        """Existing files are not reported; absent files are returned with their full path."""
        # Only file existence is checked, so a tiny volume keeps the test fast.
        # The explicit float32 dtype avoids handing platform-default int64 data to nibabel.
        test_image = nib.Nifti1Image(np.random.randint(0, 2, size=[4, 4, 4]).astype(np.float32), np.eye(4))
        with tempfile.TemporaryDirectory() as tempdir:
            nib.save(test_image, os.path.join(tempdir, "test_image1.nii.gz"))
            nib.save(test_image, os.path.join(tempdir, "test_label1.nii.gz"))
            nib.save(test_image, os.path.join(tempdir, "test_extra1.nii.gz"))
            nib.save(test_image, os.path.join(tempdir, "test_image2.nii.gz"))

            datalist = [
                {
                    "image": os.path.join(tempdir, "test_image1.nii.gz"),
                    "label": [os.path.join(tempdir, "test_label1.nii.gz"), os.path.join(tempdir, "test_extra1.nii.gz")],
                },
                {
                    "image": os.path.join(tempdir, "test_image2.nii.gz"),
                    "label": os.path.join(tempdir, "test_label_missing.nii.gz"),
                },
            ]

            missings = check_missing_files(datalist=datalist, keys=["image", "label"])
            self.assertEqual(len(missings), 1)
            self.assertEqual(missings[0], os.path.join(tempdir, "test_label_missing.nii.gz"))

            # test with missing key and relative path
            datalist = [{"image": "test_image1.nii.gz", "label": "test_label_missing.nii.gz"}]
            missings = check_missing_files(
                datalist=datalist, keys=["image", "label", "test"], root_dir=tempdir, allow_missing_keys=True
            )
            # the existing image must not be reported, and the absent key "test" must be skipped
            self.assertEqual(len(missings), 1)
            self.assertEqual(missings[0], os.path.join(tempdir, "test_label_missing.nii.gz"))

    def test_missing_key_raises(self):
        """A key absent from an item raises ValueError unless `allow_missing_keys` is set."""
        with tempfile.TemporaryDirectory() as tempdir:
            datalist = [{"image": os.path.join(tempdir, "test_image1.nii.gz")}]
            with self.assertRaises(ValueError):
                check_missing_files(datalist=datalist, keys=["image", "label"])

    def test_non_string_path_raises(self):
        """A value that is neither a string nor a sequence of strings raises ValueError."""
        with self.assertRaises(ValueError):
            check_missing_files(datalist=[{"image": 1}], keys="image")
52+
53+
54+
# Run the test cases in this module when the file is executed directly as a script.
if __name__ == "__main__":
55+
unittest.main()

0 commit comments

Comments
 (0)