Skip to content

Commit 04dad22

Browse files
444 Fix multi-node issue (#446)
Fixes #444. ### Description This PR fixes the issue in the multi-GPU configs that occurs when running on multiple nodes. It also makes a small enhancement to the `endoscopic_inbody_classification` bundle by adding a step that closes the JSON file. A unit test for this bundle is also added in this PR. ### Status **Ready** ### Please ensure all the checkboxes: <!--- Put an `x` in all the boxes that apply, and remove the not applicable items --> - [x] Codeformat tests passed locally by running `./runtests.sh --codeformat`. - [ ] In-line docstrings updated. - [ ] Update `version` and `changelog` in `metadata.json` if changing an existing bundle. - [ ] Please ensure the naming rules in config files meet our requirements (please refer to: `CONTRIBUTING.md`). - [ ] Ensure versions of packages such as `monai`, `pytorch` and `numpy` are correct in `metadata.json`. - [ ] Descriptions should be consistent with the content, such as `eval_metrics` of the provided weights and TorchScript modules. - [ ] Files larger than 25MB are excluded and replaced by providing download links in `large_file.yml`. - [ ] Avoid using path that contains personal information within config files (such as use `/home/your_name/` for `"bundle_root"`). --------- Signed-off-by: Yiheng Wang <[email protected]>
1 parent ff6175a commit 04dad22

File tree

44 files changed

+289
-71
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

44 files changed

+289
-71
lines changed

ci/unit_tests/test_brats_mri_segmentation.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -31,11 +31,11 @@
3131
]
3232

3333
TEST_CASE_2 = [ # inference
34-
{"bundle_root": "models/brats_mri_segmentation", "handlers#0#_disabled_": True} # do not load weights
34+
{"bundle_root": "models/brats_mri_segmentation", "handlers#0#_disabled_": True, "inferer#roi_size": [64, 64, 64]}
3535
]
3636

3737

38-
class TestSpleenCTSeg(unittest.TestCase):
38+
class TestBratsSeg(unittest.TestCase):
3939
def setUp(self):
4040
self.dataset_dir = tempfile.mkdtemp()
4141
dataset_size = 10

ci/unit_tests/test_brats_mri_segmentation_dist.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@
3232
]
3333

3434

35-
class TestSpleenCTSegMGPU(unittest.TestCase):
35+
class TestBratsSegMGPU(unittest.TestCase):
3636
def setUp(self):
3737
self.dataset_dir = tempfile.mkdtemp()
3838
dataset_size = 10
@@ -59,7 +59,7 @@ def tearDown(self):
5959
shutil.rmtree(self.dataset_dir)
6060

6161
@parameterized.expand([TEST_CASE_1])
62-
def test_train_eval_mgpu_config(self, override):
62+
def test_train_mgpu_config(self, override):
6363
override["dataset_dir"] = self.dataset_dir
6464
bundle_root = override["bundle_root"]
6565
train_file = os.path.join(bundle_root, "configs/train.json")
Lines changed: 125 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,125 @@
1+
# Copyright (c) MONAI Consortium
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
import os
13+
import shutil
14+
import subprocess
15+
import tempfile
16+
import unittest
17+
18+
import numpy as np
19+
from monai.bundle import ConfigWorkflow
20+
from monai.data import PILWriter
21+
from parameterized import parameterized
22+
23+
TEST_CASE_1 = [ # train, evaluate
24+
{
25+
"bundle_root": "models/endoscopic_inbody_classification",
26+
"train#trainer#max_epochs": 2,
27+
"train#dataloader#num_workers": 1,
28+
"validate#dataloader#num_workers": 1,
29+
"train#deterministic_transforms#3#spatial_size": [32, 32],
30+
}
31+
]
32+
33+
TEST_CASE_2 = [ # inference
34+
{
35+
"bundle_root": "models/endoscopic_inbody_classification",
36+
"handlers#0#_disabled_": True,
37+
"preprocessing#transforms#2#spatial_size": [32, 32],
38+
}
39+
]
40+
41+
42+
class TestEndoscopicCls(unittest.TestCase):
43+
def setUp(self):
44+
self.dataset_dir = tempfile.mkdtemp()
45+
dataset_size = 10
46+
writer = PILWriter(np.uint8)
47+
shape = (3, 256, 256)
48+
for sub_folder in ["inbody", "outbody"]:
49+
sample_dir = os.path.join(self.dataset_dir, sub_folder)
50+
os.makedirs(sample_dir)
51+
for s in range(dataset_size):
52+
image = np.random.randint(low=0, high=5, size=shape).astype(np.int8)
53+
image_filename = os.path.join(sample_dir, f"{sub_folder}_{s}.jpg")
54+
writer.set_data_array(image, channel_dim=0)
55+
writer.write(image_filename, verbose=True)
56+
57+
prepare_datalist_file = "models/endoscopic_inbody_classification/scripts/data_process.py"
58+
outpath = "models/endoscopic_inbody_classification/label"
59+
cmd = f"python {prepare_datalist_file} --datapath {self.dataset_dir} --outpath {outpath}"
60+
call_status = subprocess.run(cmd, shell=True)
61+
call_status.check_returncode()
62+
63+
def tearDown(self):
64+
shutil.rmtree(self.dataset_dir)
65+
66+
@parameterized.expand([TEST_CASE_1])
67+
def test_train_eval_config(self, override):
68+
override["dataset_dir"] = self.dataset_dir
69+
bundle_root = override["bundle_root"]
70+
train_file = os.path.join(bundle_root, "configs/train.json")
71+
eval_file = os.path.join(bundle_root, "configs/evaluate.json")
72+
73+
trainer = ConfigWorkflow(
74+
workflow="train",
75+
config_file=train_file,
76+
logging_file=os.path.join(bundle_root, "configs/logging.conf"),
77+
meta_file=os.path.join(bundle_root, "configs/metadata.json"),
78+
**override,
79+
)
80+
trainer.initialize()
81+
# check required and optional properties
82+
check_result = trainer.check_properties()
83+
if check_result is not None and len(check_result) > 0:
84+
raise ValueError(f"check properties for train config failed: {check_result}")
85+
trainer.run()
86+
trainer.finalize()
87+
88+
validator = ConfigWorkflow(
89+
# override train.json, thus set the workflow to "train" rather than "eval"
90+
workflow="train",
91+
config_file=[train_file, eval_file],
92+
logging_file=os.path.join(bundle_root, "configs/logging.conf"),
93+
meta_file=os.path.join(bundle_root, "configs/metadata.json"),
94+
**override,
95+
)
96+
validator.initialize()
97+
check_result = validator.check_properties()
98+
if check_result is not None and len(check_result) > 0:
99+
raise ValueError(f"check properties for overrided train config failed: {check_result}")
100+
validator.run()
101+
validator.finalize()
102+
103+
@parameterized.expand([TEST_CASE_2])
104+
def test_infer_config(self, override):
105+
override["dataset_dir"] = self.dataset_dir
106+
bundle_root = override["bundle_root"]
107+
108+
inferrer = ConfigWorkflow(
109+
workflow="infer",
110+
config_file=os.path.join(bundle_root, "configs/inference.json"),
111+
logging_file=os.path.join(bundle_root, "configs/logging.conf"),
112+
meta_file=os.path.join(bundle_root, "configs/metadata.json"),
113+
**override,
114+
)
115+
inferrer.initialize()
116+
# check required and optional properties
117+
check_result = inferrer.check_properties()
118+
if check_result is not None and len(check_result) > 0:
119+
raise ValueError(f"check properties for inference config failed: {check_result}")
120+
inferrer.run()
121+
inferrer.finalize()
122+
123+
124+
if __name__ == "__main__":
125+
unittest.main()
Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
# Copyright (c) MONAI Consortium
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
# http://www.apache.org/licenses/LICENSE-2.0
6+
# Unless required by applicable law or agreed to in writing, software
7+
# distributed under the License is distributed on an "AS IS" BASIS,
8+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
9+
# See the License for the specific language governing permissions and
10+
# limitations under the License.
11+
12+
import os
13+
import shutil
14+
import subprocess
15+
import tempfile
16+
import unittest
17+
18+
import numpy as np
19+
import torch
20+
from monai.data import PILWriter
21+
from parameterized import parameterized
22+
from utils import export_config_and_run_mgpu_cmd
23+
24+
TEST_CASE_1 = [
25+
{
26+
"bundle_root": "models/endoscopic_inbody_classification",
27+
"train#trainer#max_epochs": 1,
28+
"train#dataloader#num_workers": 1,
29+
"validate#dataloader#num_workers": 1,
30+
"train#deterministic_transforms#3#spatial_size": [32, 32],
31+
}
32+
]
33+
34+
35+
class TestEndoscopicClsMGPU(unittest.TestCase):
36+
def setUp(self):
37+
self.dataset_dir = tempfile.mkdtemp()
38+
dataset_size = 10
39+
writer = PILWriter(np.uint8)
40+
shape = (3, 256, 256)
41+
for sub_folder in ["inbody", "outbody"]:
42+
sample_dir = os.path.join(self.dataset_dir, sub_folder)
43+
os.makedirs(sample_dir)
44+
for s in range(dataset_size):
45+
image = np.random.randint(low=0, high=5, size=shape).astype(np.int8)
46+
image_filename = os.path.join(sample_dir, f"{sub_folder}_{s}.jpg")
47+
writer.set_data_array(image, channel_dim=0)
48+
writer.write(image_filename, verbose=True)
49+
50+
prepare_datalist_file = "models/endoscopic_inbody_classification/scripts/data_process.py"
51+
outpath = "models/endoscopic_inbody_classification/label"
52+
cmd = f"python {prepare_datalist_file} --datapath {self.dataset_dir} --outpath {outpath}"
53+
call_status = subprocess.run(cmd, shell=True)
54+
call_status.check_returncode()
55+
56+
def tearDown(self):
57+
shutil.rmtree(self.dataset_dir)
58+
59+
@parameterized.expand([TEST_CASE_1])
60+
def test_train_mgpu_config(self, override):
61+
override["dataset_dir"] = self.dataset_dir
62+
bundle_root = override["bundle_root"]
63+
train_file = os.path.join(bundle_root, "configs/train.json")
64+
mgpu_train_file = os.path.join(bundle_root, "configs/multi_gpu_train.json")
65+
output_path = os.path.join(bundle_root, "configs/train_override.json")
66+
n_gpu = torch.cuda.device_count()
67+
export_config_and_run_mgpu_cmd(
68+
config_file=[train_file, mgpu_train_file],
69+
logging_file=os.path.join(bundle_root, "configs/logging.conf"),
70+
meta_file=os.path.join(bundle_root, "configs/metadata.json"),
71+
override_dict=override,
72+
output_path=output_path,
73+
ngpu=n_gpu,
74+
)
75+
76+
77+
if __name__ == "__main__":
78+
unittest.main()

ci/unit_tests/test_spleen_ct_segmentation.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@
3838
"bundle_root": "models/spleen_ct_segmentation",
3939
"datalist": "$list(sorted(glob.glob(@dataset_dir + '/image_*.nii.gz')))",
4040
"handlers#0#_disabled_": True, # do not load weights
41+
"inferer#roi_size": [32, 32, 32],
4142
}
4243
]
4344

ci/unit_tests/test_spleen_ct_segmentation_dist.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,7 @@ def tearDown(self):
5252
shutil.rmtree(self.dataset_dir)
5353

5454
@parameterized.expand([TEST_CASE_1])
55-
def test_train_eval_mgpu_config(self, override):
55+
def test_train_mgpu_config(self, override):
5656
override["dataset_dir"] = self.dataset_dir
5757
bundle_root = override["bundle_root"]
5858
train_file = os.path.join(bundle_root, "configs/train.json")

ci/unit_tests/utils.py

Lines changed: 1 addition & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -15,17 +15,6 @@
1515
from monai.bundle import ConfigParser
1616

1717

18-
def run_command(cmd):
19-
process = subprocess.Popen(cmd, stdout=subprocess.PIPE, universal_newlines=True)
20-
while process.poll() is None:
21-
line = process.stdout.readline()
22-
line = line.rstrip()
23-
if line:
24-
print(line, flush=True)
25-
print((f"Return code: {process.returncode}"))
26-
process.stdout.close()
27-
28-
2918
def export_overrided_config(config_file, override_dict, output_path):
3019
parser = ConfigParser()
3120
parser.read_config(config_file)
@@ -65,4 +54,4 @@ def export_config_and_run_mgpu_cmd(
6554
cmd = produce_mgpu_cmd(
6655
config_file=output_path, meta_file=meta_file, logging_file=logging_file, nnodes=nnode, nproc_per_node=ngpu
6756
)
68-
run_command(cmd)
57+
subprocess.check_call(cmd)

models/brats_mri_axial_slices_generative_diffusion/configs/metadata.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
{
22
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_generator_ldm_20230507.json",
3-
"version": "1.0.4",
3+
"version": "1.0.5",
44
"changelog": {
5+
"1.0.5": "fix the wrong GPU index issue of multi-node",
56
"1.0.4": "update with new lr scheduler api",
67
"1.0.3": "update required packages",
78
"1.0.2": "remove unused saver in inference",
89
"1.0.1": "fix inference folder error",
910
"1.0.0": "Initial release"
1011
},
11-
"monai_version": "1.2.0rc7",
12+
"monai_version": "1.2.0",
1213
"pytorch_version": "1.13.1",
1314
"numpy_version": "1.22.2",
1415
"optional_packages_version": {

models/brats_mri_axial_slices_generative_diffusion/configs/multi_gpu_train_autoencoder.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
{
2-
"device": "$torch.device(f'cuda:{dist.get_rank()}')",
2+
"device": "$torch.device('cuda:' + os.environ['LOCAL_RANK'])",
33
"gnetwork": {
44
"_target_": "torch.nn.parallel.DistributedDataParallel",
55
"module": "$@autoencoder_def.to(@device)",
@@ -27,6 +27,7 @@
2727
"train#trainer#train_handlers": "$@train#handlers[: -2 if dist.get_rank() > 0 else None]",
2828
"initialize": [
2929
"$import torch.distributed as dist",
30+
"$import os",
3031
"$dist.is_initialized() or dist.init_process_group(backend='nccl')",
3132
"$torch.cuda.set_device(@device)",
3233
"$monai.utils.set_determinism(seed=123)",

models/brats_mri_generative_diffusion/configs/metadata.json

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,15 @@
11
{
22
"schema": "https://github.com/Project-MONAI/MONAI-extra-test-data/releases/download/0.8.1/meta_schema_generator_ldm_20230507.json",
3-
"version": "1.0.4",
3+
"version": "1.0.5",
44
"changelog": {
5+
"1.0.5": "fix the wrong GPU index issue of multi-node",
56
"1.0.4": "update with new lr scheduler api",
67
"1.0.3": "update required packages",
78
"1.0.2": "unify dataset dir in different configs",
89
"1.0.1": "update dependency, update trained model weights",
910
"1.0.0": "Initial release"
1011
},
11-
"monai_version": "1.2.0rc7",
12+
"monai_version": "1.2.0",
1213
"pytorch_version": "1.13.1",
1314
"numpy_version": "1.22.2",
1415
"optional_packages_version": {

0 commit comments

Comments
 (0)