FeTS-AI · aristizabal95 · Jan 19, 2024 · Jan 25, 2024 · Jan 25, 2024 · Jan 25, 2024
diff --git a/Dockerfile b/Dockerfile
@@ -5,8 +5,8 @@ LABEL authors="FeTS_Admin <[email protected]>"
 RUN apt-get update && apt-get update --fix-missing && apt-get install -y libnss3 libnspr4 libxcursor1 libxcursor-dev libasound2 libdbus-1-dev libglfw3-dev libgles2-mesa-dev ffmpeg libsm6 libxext6 python3.8 python3.8-venv python3.8-dev python3-setuptools
 
 ENV PATH=/CaPTk/bin/qt/5.12.1/bin:/CaPTk/bin/qt/5.12.1/libexec:$PATH
-ENV CMAKE_PREFIX_PATH=/CaPTk/bin/ITK-build:/CaPTk/bin/DCMTK-build:/CaPTk/bin/qt/5.12.1/lib/cmake/Qt5:$CMAKE_PREFIX_PATH
-
+ENV CMAKE_PREFIX_PATH=/CaPTk/bin/ITK-build:/CaPTk/bin/DCMTK-build:/CaPTk/bin/qt/5.12.1/lib/cmake/Qt5
+ENV DCMTK_DIR=/CaPTk/bin/DCMTK-build
 RUN pwd && ls -l
 
 WORKDIR /Front-End
@@ -88,13 +88,10 @@ RUN pip install git+https://github.com/mlcommons/GaNDLF@616b37bafad8f89d5c816a88
 # setup a separate env for nnunet
 RUN python -m venv /nnunet_env && /nnunet_env/bin/pip install --upgrade pip
 
-RUN /nnunet_env/bin/pip install torch==1.12.1+cu102 torchvision==0.13.1+cu102 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu102
+RUN /nnunet_env/bin/pip install --no-cache-dir torch==2.1.2+cu118 torchvision==0.16.2+cu118 torchaudio==2.1.2+cu118 --extra-index-url https://download.pytorch.org/whl/cu118
 
 RUN /nnunet_env/bin/pip install git+https://github.com/MIC-DKFZ/nnUNet.git@nnunetv1
 
-ENV nnUNet_raw_data_base="/tmp/nnUNet_raw_data_base"
-ENV nnUNet_preprocessed="/tmp/nnUNet_preprocessed"
-# see https://docs.docker.com/config/containers/resource_constraints/#gpu for detailed explanation
 ENV CUDA_VISIBLE_DEVICES="0"
 
 COPY ./mlcubes/data_preparation/project /project

diff --git a/docs/assets/img/rano_docker.png b/docs/assets/img/rano_docker.png
diff --git a/mlcubes/.gitignore b/mlcubes/.gitignore
@@ -7,4 +7,5 @@
 */mlcube/workspace/*
 !requirements.txt
 !*/mlcube/workspace/parameters.yaml
-models
+models
+tmpmodel
-tmpmodel
+tmpmodel
+
-tmpmodel
+tmpmodel
+
diff --git a/mlcubes/data_preparation/mlcube/clean.sh b/mlcubes/data_preparation/mlcube/clean.sh
@@ -0,0 +1,5 @@
+# Delete the generated workspace folders (paths are relative to the current directory)
+for dir in data labels metadata report statistics
+do
+    rm -rf "workspace/$dir"
+done
diff --git a/mlcubes/data_preparation/mlcube/mlcube.yaml b/mlcubes/data_preparation/mlcube/mlcube.yaml
@@ -8,7 +8,7 @@ platform:
 
 docker:
   # Image name
-  image: mlcommons/rano-data-prep-mlcube:latest
+  image: mlcommons/rano-data-prep-mlcube:1.0.10
   # Docker build context relative to $MLCUBE_ROOT. Default is `build`.
   build_context: "../project"
   # Docker file name within docker build context, default is `Dockerfile`.

diff --git a/mlcubes/data_preparation/mlcube/tests.sh b/mlcubes/data_preparation/mlcube/tests.sh
@@ -0,0 +1,63 @@
+
+DATA=./workspace/data
+
+# Run the mlcube `prepare` task from ./mlcube.yaml with --network=none and
+# --mount=ro, writing the report to report/report.yaml and passing
+# `-u <uid>:<gid>` of the invoking user through the docker cpu_args/gpu_args.
+run() {
+mlcube run --mlcube ./mlcube.yaml --task prepare --network=none --mount=ro \
+        report_file=report/report.yaml \
+        labels_path=input_data \
+        -Pdocker.cpu_args="-u $(id -u):$(id -g)" \
+        -Pdocker.gpu_args="-u $(id -u):$(id -g)"
+}
+
+# Run the `sanity_check` task followed by the `statistics` task, both with
+# --network=none, --mount=ro and `-u <uid>:<gid>` of the invoking user in the
+# docker cpu_args/gpu_args; statistics writes to statistics/statistics.yaml.
+run_other() {
+mlcube run --mlcube ./mlcube.yaml --task sanity_check --network=none --mount=ro \
+        -Pdocker.cpu_args="-u $(id -u):$(id -g)" \
+        -Pdocker.gpu_args="-u $(id -u):$(id -g)"
+
+mlcube run --mlcube ./mlcube.yaml --task statistics --network=none --mount=ro \
+        output_path=statistics/statistics.yaml \
+        -Pdocker.cpu_args="-u $(id -u):$(id -g)" \
+        -Pdocker.gpu_args="-u $(id -u):$(id -g)"
+}
+
+STARTTIME=$(date +%s.%N)
+
+# first prepare pass; the manual review below copies tumor masks into finalized/
+run
+
+# manual review
+cp "$DATA/tumor_extracted/DataForQC/AAAC_1/2008.03.31/TumorMasksForQC/AAAC_1_2008.03.31_tumorMask_model_0.nii.gz" \
+   "$DATA/tumor_extracted/DataForQC/AAAC_1/2008.03.31/TumorMasksForQC/finalized/AAAC_1_2008.03.31_tumorMask_model_0.nii.gz"
+
+cp "$DATA/tumor_extracted/DataForQC/AAAC_1/2012.01.02/TumorMasksForQC/AAAC_1_2012.01.02_tumorMask_model_0.nii.gz" \
+   "$DATA/tumor_extracted/DataForQC/AAAC_1/2012.01.02/TumorMasksForQC/finalized/AAAC_1_2012.01.02_tumorMask_model_0.nii.gz"
+
+cp "$DATA/tumor_extracted/DataForQC/AAAC_2/2001.01.01/TumorMasksForQC/AAAC_2_2001.01.01_tumorMask_model_0.nii.gz" \
+   "$DATA/tumor_extracted/DataForQC/AAAC_2/2001.01.01/TumorMasksForQC/finalized/AAAC_2_2001.01.01_tumorMask_model_0.nii.gz"
+# end manual review
+
+# run prepare again in the background so this script can answer its prompt
+run &
+PID=$!
+
+# prompt response
+# stop polling if the run exits first, otherwise this loop would never end
+while [ ! -f "$DATA/.prompt.txt" ] && kill -0 "$PID" 2>/dev/null
+do
+    sleep 0.1s
+done
+
+if [ -f "$DATA/.prompt.txt" ]; then
+    echo -n "y" >> "$DATA/.response.txt"
+else
+    echo "run exited before creating $DATA/.prompt.txt" >&2
+fi
+# end prompt response
+
+wait "$PID"
+
+ENDTIME=$(date +%s.%N)
+DIFF=$(echo "$ENDTIME - $STARTTIME" | bc)
+echo "$DIFF"
+
+run_other
-run_other
+run_other
+
-run_other
+run_other
+
diff --git a/mlcubes/data_preparation/mlcube/tests_sing.sh b/mlcubes/data_preparation/mlcube/tests_sing.sh
@@ -0,0 +1,62 @@
+
+DATA=./workspace/data
+
+# Run the mlcube `prepare` task from ./mlcube.yaml on the singularity platform
+# with --network=none and --mount=ro, writing the report to report/report.yaml
+# and passing `-nce` as the singularity run_args.
+run() {
+mlcube run --mlcube ./mlcube.yaml --task prepare --network=none --mount=ro --platform=singularity \
+        report_file=report/report.yaml \
+        labels_path=input_data \
+        -Psingularity.run_args="-nce"
+}
+
+# Run the `sanity_check` task followed by the `statistics` task on the
+# singularity platform, both with --network=none, --mount=ro and `-nce` as
+# the singularity run_args; statistics writes to statistics/statistics.yaml.
+run_other() {
+mlcube run --mlcube ./mlcube.yaml --task sanity_check --network=none --mount=ro --platform=singularity \
+        -Psingularity.run_args="-nce"
+
+
+mlcube run --mlcube ./mlcube.yaml --task statistics --network=none --mount=ro --platform=singularity \
+        output_path=statistics/statistics.yaml \
+        -Psingularity.run_args="-nce"
+
+}
+
+STARTTIME=$(date +%s.%N)
+
+# first prepare pass; the manual review below copies tumor masks into finalized/
+run
+
+# manual review
+cp "$DATA/tumor_extracted/DataForQC/AAAC_1/2008.03.31/TumorMasksForQC/AAAC_1_2008.03.31_tumorMask_model_0.nii.gz" \
+   "$DATA/tumor_extracted/DataForQC/AAAC_1/2008.03.31/TumorMasksForQC/finalized/AAAC_1_2008.03.31_tumorMask_model_0.nii.gz"
+
+cp "$DATA/tumor_extracted/DataForQC/AAAC_1/2012.01.02/TumorMasksForQC/AAAC_1_2012.01.02_tumorMask_model_0.nii.gz" \
+   "$DATA/tumor_extracted/DataForQC/AAAC_1/2012.01.02/TumorMasksForQC/finalized/AAAC_1_2012.01.02_tumorMask_model_0.nii.gz"
+
+cp "$DATA/tumor_extracted/DataForQC/AAAC_2/2001.01.01/TumorMasksForQC/AAAC_2_2001.01.01_tumorMask_model_0.nii.gz" \
+   "$DATA/tumor_extracted/DataForQC/AAAC_2/2001.01.01/TumorMasksForQC/finalized/AAAC_2_2001.01.01_tumorMask_model_0.nii.gz"
+# end manual review
+
+# run prepare again in the background so this script can answer its prompt
+run &
+PID=$!
+
+# prompt response
+# stop polling if the run exits first, otherwise this loop would never end
+while [ ! -f "$DATA/.prompt.txt" ] && kill -0 "$PID" 2>/dev/null
+do
+    sleep 0.1s
+done
+
+if [ -f "$DATA/.prompt.txt" ]; then
+    echo -n "y" >> "$DATA/.response.txt"
+else
+    echo "run exited before creating $DATA/.prompt.txt" >&2
+fi
+# end prompt response
+
+wait "$PID"
+
+ENDTIME=$(date +%s.%N)
+DIFF=$(echo "$ENDTIME - $STARTTIME" | bc)
+echo "$DIFF"
+
+run_other
-run_other
+run_other
+
-run_other
+run_other
+
diff --git a/mlcubes/data_preparation/project/Dockerfile.dev b/mlcubes/data_preparation/project/Dockerfile.dev
@@ -0,0 +1,17 @@
+# Dev/test image layered on the base image: downsampled atlas and dummy models
+FROM hasan7/baselocal:0.0.0
+
+# COPY of a directory copies only its contents, so name the target explicitly
+COPY ./atlasImage_0.125.nii.gz /project/
+COPY ./tmpmodel /project/tmpmodel
+
+# use a downsampled reference image for DICOM to NIFTI conversion
+RUN mv /project/atlasImage_0.125.nii.gz /Front-End/bin/install/appdir/usr/data/sri24/atlasImage.nii.gz
+
+# swap the heavy brain extraction models for the dummy model in one layer
+RUN rm -rf /project/stages/data_prep_models/brain_extraction/model_0/ \
+        /project/stages/data_prep_models/brain_extraction/model_1/ \
+    && cp -r /project/tmpmodel /project/stages/data_prep_models/brain_extraction/model_0 \
+    && mv /project/tmpmodel /project/stages/data_prep_models/brain_extraction/model_1
+
+ENTRYPOINT ["python", "/project/mlcube.py"]
-ENTRYPOINT ["python", "/project/mlcube.py"]
+ENTRYPOINT ["python", "/project/mlcube.py"]
+
-ENTRYPOINT ["python", "/project/mlcube.py"]
+ENTRYPOINT ["python", "/project/mlcube.py"]
+
diff --git a/mlcubes/data_preparation/project/README.md b/mlcubes/data_preparation/project/README.md
@@ -0,0 +1,12 @@
+# How to run tests
+
+1. Download and extract (sha256: 701fbba8b253fc5b2f54660837c493a38dec986df9bdbf3d97f07c8bc276a965):
+<https://storage.googleapis.com/medperf-storage/rano_test_assets/dev.tar.gz>
+
+2. Move `additional_files` and `input_data` to the mlcube workspace
+3. Move `tmpmodel` and `atlasImage_0.125.nii.gz` to the mlcube project folder
+
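+4. Build the base docker image from the `Dockerfile` in the repo's root folder, tagging it with the name used in the `FROM` line of `Dockerfile.dev` (`hasan7/baselocal:0.0.0`)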
+5. Build the dev docker image using `Dockerfile.dev` in the mlcube project folder.
+6. Then change the docker image name in `mlcube.yaml` according to step 5.
+7. Then go to the `mlcube` folder and run the test scripts (`tests.sh` for docker, `tests_sing.sh` for singularity)
diff --git a/mlcubes/data_preparation/project/mlcube.py b/mlcubes/data_preparation/project/mlcube.py
@@ -1,12 +1,13 @@
 """MLCube handler file"""
+import os
 import typer
 import subprocess
-
+import shutil
 
 app = typer.Typer()
 
 
-def exec_python(cmd: str) -> None:
+def exec_python(cmd: str, check_for_failure=True) -> None:
     """Execute a python script as a subprocess
 
     Args:
@@ -15,7 +16,8 @@ def exec_python(cmd: str) -> None:
     splitted_cmd = cmd.split()
     process = subprocess.Popen(splitted_cmd, cwd=".")
     process.wait()
-    assert process.returncode == 0, f"command failed: {cmd}"
+    if check_for_failure:
+        assert process.returncode == 0, f"command failed: {cmd}"
 
 
 @app.command("prepare")
@@ -29,10 +31,9 @@ def prepare(
     report_file: str = typer.Option(..., "--report_file"),
     metadata_path: str = typer.Option(..., "--metadata_path"),
 ):
-    cmd = f"python3 project/prepare.py --data_path={data_path} --labels_path={labels_path} --models_path={models_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}"
+    cmd = f"python3 /project/prepare.py --data_path={data_path} --labels_path={labels_path} --models_path={models_path} --data_out={output_path} --labels_out={output_labels_path} --report={report_file} --parameters={parameters_file} --metadata_path={metadata_path}"
     exec_python(cmd)
 
-
 @app.command("sanity_check")
 def sanity_check(
     data_path: str = typer.Option(..., "--data_path"),
@@ -41,7 +42,7 @@ def sanity_check(
     metadata_path: str = typer.Option(..., "--metadata_path"),
 ):
     # Modify the sanity_check command as needed
-    cmd = f"python3 project/sanity_check.py --data_path={data_path} --labels_path={labels_path} --metadata={metadata_path}"
+    cmd = f"python3 /project/sanity_check.py --data_path={data_path} --labels_path={labels_path} --metadata={metadata_path}"
     exec_python(cmd)
 
 
@@ -54,8 +55,8 @@ def sanity_check(
     out_path: str = typer.Option(..., "--output_path"),
 ):
     # Modify the statistics command as needed
-    cmd = f"python3 project/statistics.py --data_path={data_path} --labels_path={labels_path} --out_file={out_path} --metadata={metadata_path}"
-    exec_python(cmd)
+    cmd = f"python3 /project/statistics.py --data_path={data_path} --labels_path={labels_path} --out_file={out_path} --metadata={metadata_path}"
+    exec_python(cmd, check_for_failure=False) # Don't throw an error if it fails, to avoid traceback and confusion from users
 
 
 if __name__ == "__main__":

diff --git a/mlcubes/data_preparation/project/prepare.py b/mlcubes/data_preparation/project/prepare.py
@@ -1,4 +1,5 @@
 import os
+import shutil
 import argparse
 import pandas as pd
 import yaml
@@ -157,10 +158,25 @@ def init_report(args) -> pd.DataFrame:
 def main():
     args = setup_argparser()
 
-    os.environ["RESULTS_FOLDER"] = os.path.join(args.models, "nnUNet_trained_models")
+    output_path = args.data_out
+    models_path = args.models
+
+    tmpfolder = os.path.join(output_path, ".tmp")
+    cbica_tmpfolder = os.path.join(tmpfolder, ".cbicaTemp")
+    os.environ["TMPDIR"] = tmpfolder
+    os.environ["CBICA_TEMP_DIR"] = cbica_tmpfolder
+    os.makedirs(tmpfolder, exist_ok=True)
+    os.makedirs(cbica_tmpfolder, exist_ok=True)
+    os.environ["RESULTS_FOLDER"] = os.path.join(models_path, "nnUNet_trained_models")
+    os.environ["nnUNet_raw_data_base"] = os.path.join(tmpfolder, "nnUNet_raw_data_base")
+    os.environ["nnUNet_preprocessed"] = os.path.join(tmpfolder, "nnUNet_preprocessed")
+
     report = init_report(args)
     pipeline = init_pipeline(args)
     pipeline.run(report, args.report)
 
+    # cleanup tmp folder
+    shutil.rmtree(tmpfolder, ignore_errors=True)
+
 if __name__ == "__main__":
     main()
diff --git a/mlcubes/data_preparation/project/requirements.txt b/mlcubes/data_preparation/project/requirements.txt
@@ -1,10 +1,10 @@
-typer
-pandas
-PyYAML
+typer==0.9.0
+pandas==1.5.3
+PyYAML==6.0.1
 # Include all your requirements here
-SimpleITK
-tqdm
-scikit-image
+SimpleITK==2.3.1
+tqdm==4.66.2
+scikit-image==0.21.0
 FigureGenerator==0.0.4
 gandlf==0.0.16
 labelfusion==1.0.14

diff --git a/mlcubes/data_preparation/project/sanity_check.py b/mlcubes/data_preparation/project/sanity_check.py
@@ -14,9 +14,9 @@ def sanity_check(data_path: str, labels_path: str):
     """
     # Here you must add all the checks you consider important regarding the
     # state of the data
-    assert has_prepared_folder_structure(
-        data_path, labels_path
-    ), "The contents of the labels and data don't ressemble a prepared dataset"
+    if not has_prepared_folder_structure(data_path, labels_path):
+        print("The contents of the labels and data don't resemble a prepared dataset", flush=True)
+        exit(1)
 
 
 if __name__ == "__main__":

diff --git a/mlcubes/data_preparation/project/stages/comparison.py b/mlcubes/data_preparation/project/stages/comparison.py
@@ -106,6 +106,7 @@ def __report_success(
         return report
 
     def could_run(self, index: Union[str, int], report: DataFrame) -> bool:
+        print(f"Checking if {self.name} can run")
         # Ensure a single reviewed segmentation file exists
         path = self.__get_input_path(index)
         gt_path = self.__get_backup_path(index)
@@ -125,6 +126,7 @@ def could_run(self, index: Union[str, int], report: DataFrame) -> bool:
 
         prev_hash = report.loc[index]["segmentation_hash"]
         hash_changed = prev_hash != reviewed_hash
+        print(f"{path_exists=} and {contains_case=} and {gt_path_exists=} and {hash_changed=}")
         is_valid = path_exists and contains_case and gt_path_exists and hash_changed
 
         return is_valid

diff --git a/mlcubes/data_preparation/project/stages/confirm.py b/mlcubes/data_preparation/project/stages/confirm.py
@@ -127,13 +127,15 @@ def __process_row(self, row: pd.Series) -> pd.Series:
         return row
 
     def could_run(self, report: DataFrame) -> bool:
+        print(f"Checking if {self.name} can run")
         # could run once all cases have been compared to the ground truth
         missing_voxels = report["num_changed_voxels"].isnull().values.any()
         prev_path_exists = os.path.exists(self.prev_stage_path)
         empty_prev_path = True
         if prev_path_exists:
             empty_prev_path = len(os.listdir(self.prev_stage_path)) == 0
 
+        print(f"{prev_path_exists=} and not {empty_prev_path=} and not {missing_voxels=}")
         return prev_path_exists and not empty_prev_path and not missing_voxels
 
     def execute(self, report: DataFrame) -> Tuple[DataFrame, bool]:

diff --git a/mlcubes/data_preparation/project/stages/extract.py b/mlcubes/data_preparation/project/stages/extract.py
@@ -57,8 +57,11 @@ def could_run(self, index: Union[str, int], report: pd.DataFrame) -> bool:
         Returns:
             bool: Wether this stage could be executed for the given case
         """
+        print(f"Checking if {self.name} can run")
         prev_paths = self.__get_paths(index, self.prev_path, self.prev_subpath)
-        return all([os.path.exists(path) for path in prev_paths])
+        is_valid = all([os.path.exists(path) for path in prev_paths])
+        print(f"{is_valid=}")
+        return is_valid
 
     def execute(
         self, index: Union[str, int], report: pd.DataFrame
@@ -97,6 +100,7 @@ def __copy_case(self, index: Union[str, int]):
         prev_paths = self.__get_paths(index, self.prev_path, self.prev_subpath)
         copy_paths = self.__get_paths(index, self.out_path, self.prev_subpath)
         for prev, copy in zip(prev_paths, copy_paths):
+            shutil.rmtree(copy, ignore_errors=True)
             shutil.copytree(prev, copy, dirs_exist_ok=True)
 
     def _process_case(self, index: Union[str, int]):