diff --git a/docs/source/_toctree.yml b/docs/source/_toctree.yml
index cc6b7195fe2..08ba233ce8d 100644
--- a/docs/source/_toctree.yml
+++ b/docs/source/_toctree.yml
@@ -90,6 +90,8 @@
       title: Create a document dataset
-    - local: nifti_dataset
-      title: Create a medical imaging dataset
+    - local: medical_imaging_dataset
+      title: Create a medical imaging dataset (NIfTI or DICOM)
   title: "Vision"
- sections:
   - local: nlp_load
diff --git a/docs/source/nifti_dataset.mdx b/docs/source/medical_imaging_dataset.mdx
similarity index 51%
rename from docs/source/nifti_dataset.mdx
rename to docs/source/medical_imaging_dataset.mdx
index 2770460fbaf..04fedcbb11a 100644
--- a/docs/source/nifti_dataset.mdx
+++ b/docs/source/medical_imaging_dataset.mdx
@@ -1,26 +1,34 @@
-# Create a NIfTI dataset
+# Create a medical imaging dataset
 
-This page shows how to create and share a dataset of medical images in NIfTI format (.nii / .nii.gz) using the `datasets` library.
+This page shows how to create and share a dataset of medical images in NIfTI format (.nii / .nii.gz) or DICOM format (.dcm) using the `datasets` library.
+
+The two formats are typically used as follows:
+ - NIfTI (Neuroimaging Informatics Technology Initiative): stores MRI, fMRI, CT, and PET scans in research settings.
+ - DICOM (Digital Imaging and Communications in Medicine): stores medical images in clinical settings, together with metadata about patients and imaging procedures.
 
 You can share a dataset with your team or with anyone in the community by creating a dataset repository on the Hugging Face Hub:
 
 ```py
 from datasets import load_dataset
 
-dataset = load_dataset("<username>/my_nifti_dataset")
+dataset = load_dataset("<username>/my_nifti_or_dicom_dataset")
 ```
 
-There are two common ways to create a NIfTI dataset:
+There are two common ways to create a NIfTI or DICOM dataset:
 
-- Create a dataset from local NIfTI files in Python and upload it with `Dataset.push_to_hub`.
+- Create a dataset from local files in Python and upload it with `Dataset.push_to_hub`.
 - Use a folder-based convention (one file per example) and a small helper to convert it into a `Dataset`.
 
 > [!TIP]
 > You can control access to your dataset by requiring users to share their contact information first. Check out the [Gated datasets](https://huggingface.co/docs/hub/datasets-gated) guide for more information.
 
 ## Local files
 
-If you already have a list of file paths to NIfTI files, the easiest workflow is to create a `Dataset` from that list and cast the column to the `Nifti` feature.
+If you already have a list of file paths to medical imaging files, the easiest workflow is to create a `Dataset` from that list and cast the column to the `Nifti` feature.
 
 ```py
 from datasets import Dataset, Nifti
 
@@ -35,7 +43,17 @@ ds = Dataset.from_dict({"nifti": files}).cast_column("nifti", Nifti())
 # or a dict {'bytes': None, 'path': '...'} when decode=False
 ```
 
-The `Nifti` feature supports a `decode` parameter. When `decode=True` (the default), it loads the NIfTI file into a `nibabel.nifti1.Nifti1Image` object. You can access the image data as a numpy array with `img.get_fdata()`. When `decode=False`, it returns a dict with the file path and bytes.
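+
+If you also have labels or other per-scan fields, you can store them as additional columns next to the imaging column. A minimal sketch (the paths and labels here are hypothetical):
+
+```py
+from datasets import Dataset, Nifti
+
+# hypothetical file paths with one label per scan
+files = ["/path/to/file_0001.nii.gz", "/path/to/file_0002.nii.gz"]
+labels = ["healthy", "disease_x"]
+ds = Dataset.from_dict({"nifti": files, "label": labels}).cast_column("nifti", Nifti())
+```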
+For DICOM use:
+
+```python
+from datasets import Dataset, Dicom
+
+# simple example: create a dataset from file paths
+files = ["/path/to/file_001.dcm", "/path/to/file_002.dcm"]
+ds = Dataset.from_dict({"dicom": files}).cast_column("dicom", Dicom())
+```
+
+The `Nifti` and `Dicom` features support a `decode` parameter. When `decode=True` (the default), a NIfTI file is loaded into a `nibabel.nifti1.Nifti1Image` object and a DICOM file into a `pydicom.dataset.FileDataset` object. For NIfTI files you can access the image data as a numpy array with `img.get_fdata()`; for DICOM files use `img.pixel_array`.
+When `decode=False`, it returns a dict with the file path and bytes.
 
 ```py
 from datasets import Dataset, Nifti
 
@@ -45,15 +63,23 @@ img = ds[0]["nifti"]  # instance of: nibabel.nifti1.Nifti1Image
 arr = img.get_fdata()
 ```
 
+```python
+from datasets import Dataset, Dicom
+
+ds = Dataset.from_dict({"dicom": ["/path/to/file_without_meta.dcm"]}).cast_column("dicom", Dicom(decode=True))
+img = ds[0]["dicom"]
+arr = img.pixel_array
+```
+
 After preparing the dataset you can push it to the Hub:
 
 ```py
-ds.push_to_hub("<username>/my_nifti_dataset")
+ds.push_to_hub("<username>/my_nifti_or_dicom_dataset")
 ```
 
-This will create a dataset repository containing your NIfTI dataset with a `data/` folder of parquet shards.
+This will create a dataset repository containing your medical imaging dataset with a `data/` folder of parquet shards.
 
 ## Folder conventions and metadata
 
 If you organize your dataset in folders you can create splits automatically (train/test/validation) by following a structure like:
 
 ```
 dataset/train/scan_0001.nii
@@ -64,7 +90,7 @@
 dataset/validation/scan_1001.nii
 dataset/test/scan_2001.nii
 ```
 
-If you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` in the folder so files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field with the relative path to the NIfTI file next to the metadata file.
+If you have labels or other metadata, provide a `metadata.csv`, `metadata.jsonl`, or `metadata.parquet` in the folder so files can be linked to metadata rows. The metadata must contain a `file_name` (or `*_file_name`) field with the relative path to the NIfTI/DICOM file next to the metadata file.
 
 Example `metadata.csv`:
 
@@ -74,7 +100,7 @@
 scan_0001.nii.gz,P001,45,healthy
 scan_0002.nii.gz,P002,59,disease_x
 ```
 
-The `Nifti` feature works with zipped datasets too — each zip can contain NIfTI files and a metadata file. This is useful when uploading large datasets as archives.
+The `Nifti` feature works with zipped datasets too — each zip can contain NIfTI files and a metadata file. This is useful when uploading large datasets as archives. Note that this is not supported for DICOM files.
 This means your dataset structure could look like this (mixed compressed and uncompressed files):
 ```
 dataset/train/scan_0001.nii.gz
@@ -83,7 +109,7 @@
 dataset/validation/scan_1001.nii.gz
 dataset/test/scan_2001.nii
 ```
 
 ## Converting to PyTorch tensors
 
 Use the [`~Dataset.set_transform`] function to apply the transformation on-the-fly to batches of the dataset:
 
@@ -99,10 +125,23 @@ def transform_to_pytorch(example):
 ds.set_transform(transform_to_pytorch)
 ```
 
-Accessing elements now (e.g. `ds[0]`) will yield torch tensors in the `"nifti_torch"` key.
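+
+You can verify the transform by inspecting one element (the printed shape is hypothetical and depends on your scans):
+
+```py
+sample = ds[0]
+print(sample["nifti_torch"].shape)  # e.g. torch.Size([64, 64, 32])
+```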
+
+For DICOM, the same pattern applies, using `pixel_array` to get the image data:
+
+```py
+import torch
+
+def transform_to_pytorch(example):
+    example["dicom_torch"] = [torch.tensor(ex.pixel_array) for ex in example["dicom"]]
+    return example
+
+ds.set_transform(transform_to_pytorch)
+```
+
+Accessing elements now (e.g. `ds[0]`) will yield torch tensors under the `"nifti_torch"` or `"dicom_torch"` key.
 
-## Usage of NifTI1Image
+## Usage of Nifti1Image
 
 NifTI is a format to store the result of 3 (or even 4) dimensional brain scans. This includes 3 spatial dimensions (x,y,z)
 and optionally a time dimension (t). Furthermore, the given positions here are only relative to the scanner, therefore
@@ -127,4 +166,33 @@ for epi_img in nifti_ds:
 ```
 
 For further reading we refer to the [nibabel documentation](https://nipy.org/nibabel/index.html) and especially [this nibabel tutorial](https://nipy.org/nibabel/coordinate_systems.html)
+
+## Usage of pydicom
+
+DICOM files are loaded with the [pydicom](https://pydicom.github.io/) library, so you can use all of pydicom's functionality to access metadata and pixel data.
+
+```python
+from datasets import load_dataset
+
+dicom_ds = load_dataset("<username>/my_dicom_dataset", split="train")
+for dicom_img in dicom_ds:
+    dicom_object = dicom_img["dicom"]
+    print(dicom_object.PatientID)
+    print(dicom_object.StudyDate)
+    pixel_array = dicom_object.pixel_array
+    print(pixel_array.shape)
+```
+
+You can visualize the DICOM images using matplotlib as follows:
+
+```python
+import matplotlib.pyplot as plt
+from datasets import load_dataset
+
+dicom_ds = load_dataset("<username>/my_dicom_dataset", split="train")
+for dicom_img in dicom_ds:
+    dicom_object = dicom_img["dicom"]
+    plt.imshow(dicom_object.pixel_array, cmap=plt.cm.gray)
+    plt.show()
+```
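+
+Pixel values in CT scans are often stored with a linear rescale. A minimal sketch of converting them to Hounsfield units, assuming the file defines the standard `RescaleSlope` and `RescaleIntercept` elements (not every modality does):
+
+```python
+from datasets import load_dataset
+
+dicom_ds = load_dataset("<username>/my_dicom_dataset", split="train")
+dicom_object = dicom_ds[0]["dicom"]
+# apply the stored linear rescale; this raises AttributeError if the elements are absent
+hu = dicom_object.pixel_array * dicom_object.RescaleSlope + dicom_object.RescaleIntercept
+print(hu.min(), hu.max())
+```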
+
+For further reading we refer to the [pydicom documentation](https://pydicom.github.io/pydicom/stable/) and the [pydicom tutorials](https://pydicom.github.io/pydicom/stable/tutorials/index.html).
diff --git a/docs/source/package_reference/loading_methods.mdx b/docs/source/package_reference/loading_methods.mdx
index 4792d1b88f7..dbe840deeea 100644
--- a/docs/source/package_reference/loading_methods.mdx
+++ b/docs/source/package_reference/loading_methods.mdx
@@ -109,6 +109,12 @@ load_dataset("csv", data_dir="path/to/data/dir", sep="\t")
 
 [[autodoc]] datasets.packaged_modules.niftifolder.NiftiFolder
 
+### Dicom
+
+[[autodoc]] datasets.packaged_modules.dicomfolder.DicomFolderConfig
+
+[[autodoc]] datasets.packaged_modules.dicomfolder.DicomFolder
+
 ### WebDataset
 
 [[autodoc]] datasets.packaged_modules.webdataset.WebDataset
diff --git a/docs/source/package_reference/main_classes.mdx b/docs/source/package_reference/main_classes.mdx
index 84e651f9171..873a8c72029 100644
--- a/docs/source/package_reference/main_classes.mdx
+++ b/docs/source/package_reference/main_classes.mdx
@@ -275,6 +275,10 @@ Dictionary with split names as keys ('train', 'test' for example), and `Iterable
 
 [[autodoc]] datasets.Nifti
 
+### Dicom
+
+[[autodoc]] datasets.Dicom
+
 ## Filesystems
 
 [[autodoc]] datasets.filesystems.is_remote_filesystem
diff --git a/setup.py b/setup.py
index 2f626763113..b508197ef86 100644
--- a/setup.py
+++ b/setup.py
@@ -210,6 +210,8 @@
 
 NIBABEL_REQUIRE = ["nibabel>=5.3.2"]
 
+PYDICOM_REQUIRE = ["pydicom>=3.0.1"]
+
 EXTRAS_REQUIRE = {
     "audio": AUDIO_REQUIRE,
     "vision": VISION_REQUIRE,
@@ -228,6 +230,7 @@
     "docs": DOCS_REQUIRE,
     "pdfs": PDFS_REQUIRE,
     "nibabel": NIBABEL_REQUIRE,
+    "pydicom": PYDICOM_REQUIRE,
 }
 
 setup(
diff --git a/src/datasets/config.py b/src/datasets/config.py
index b6412682727..1f081fdeddb 100644
--- a/src/datasets/config.py
+++ b/src/datasets/config.py
@@ -140,6 +140,7 @@
 TORCHVISION_AVAILABLE = importlib.util.find_spec("torchvision") is not None
 PDFPLUMBER_AVAILABLE = importlib.util.find_spec("pdfplumber") is not None
 NIBABEL_AVAILABLE = importlib.util.find_spec("nibabel") is not None
+PYDICOM_AVAILABLE = importlib.util.find_spec("pydicom") is not None
 
 # Optional compression tools
 RARFILE_AVAILABLE = importlib.util.find_spec("rarfile") is not None
diff --git a/src/datasets/features/__init__.py b/src/datasets/features/__init__.py
index 40a3568039a..bf8fad3b88f 100644
--- a/src/datasets/features/__init__.py
+++ b/src/datasets/features/__init__.py
@@ -16,8 +16,10 @@
     "Video",
     "Pdf",
     "Nifti",
+    "Dicom",
 ]
 from .audio import Audio
+from .dicom import Dicom
 from .features import Array2D, Array3D, Array4D, Array5D, ClassLabel, Features, LargeList, List, Sequence, Value
 from .image import Image
 from .nifti import Nifti
diff --git a/src/datasets/features/dicom.py b/src/datasets/features/dicom.py
new file mode 100644
index 00000000000..839fa4722a5
--- /dev/null
+++ b/src/datasets/features/dicom.py
@@ -0,0 +1,244 @@
+import os
+from dataclasses import dataclass, field
+from io import BytesIO
+from pathlib import Path
+from typing import TYPE_CHECKING, Any, ClassVar, Dict, Optional, TypedDict, Union
+
+import pyarrow as pa
+
+from .. import config
+from ..download.download_config import DownloadConfig
+from ..table import array_cast
+from ..utils.file_utils import is_local_path, xopen
+from ..utils.py_utils import string_to_dict
+
+
+class DicomDict(TypedDict):
+    bytes: Optional[bytes]
+    path: Optional[str]
+
+
+if TYPE_CHECKING:
+    import pydicom
+
+    from .features import FeatureType
+
+
+@dataclass
+class Dicom:
+    """
+    **Experimental.**
+    Dicom [`Feature`] to read DICOM medical imaging files.
+
+    Input: The Dicom feature accepts as input:
+    - A `str`: Absolute path to the DICOM file (i.e. random access is allowed).
+    - A `pathlib.Path`: path to the DICOM file (i.e. random access is allowed).
+    - A `dict` with the keys:
+
+        - `path`: String with relative path of the DICOM file in a dataset repository.
+        - `bytes`: Bytes of the DICOM file.
+
+      This is useful for archived files with sequential access.
+    - A `pydicom.FileDataset`: pydicom dataset object.
+
+    Args:
+        decode (`bool`, defaults to `True`):
+            Whether to decode the DICOM data. If `False`,
+            returns the underlying dictionary in the format `{"path": dicom_path, "bytes": dicom_bytes}`.
+        force (`bool`, defaults to `False`):
+            Force reading files missing the DICOM File Meta Information header or 'DICM' prefix.
+            Passed to `pydicom.dcmread(force=...)`.
+
+    Examples:
+
+    ```py
+    >>> from datasets import Dataset, Dicom
+    >>> ds = Dataset.from_dict({"dicom": ["path/to/file.dcm"]}).cast_column("dicom", Dicom())
+    >>> ds.features["dicom"]
+    Dicom(decode=True, force=False, id=None)
+    >>> ds[0]["dicom"]
+    <pydicom.dataset.FileDataset>
+    >>> ds = ds.cast_column("dicom", Dicom(decode=False))
+    >>> ds[0]["dicom"]
+    {'bytes': None,
+     'path': 'path/to/file.dcm'}
+    ```
+    """
+
+    decode: bool = True
+    force: bool = False
+    id: Optional[str] = field(default=None, repr=False)
+
+    # Automatically constructed
+    dtype: ClassVar[str] = "pydicom.dataset.FileDataset"
+    pa_type: ClassVar[Any] = pa.struct({"bytes": pa.binary(), "path": pa.string()})
+    _type: str = field(default="Dicom", init=False, repr=False)
+
+    def __call__(self):
+        return self.pa_type
+
+    def encode_example(self, value: Union[str, bytes, bytearray, dict, "pydicom.FileDataset"]) -> dict:
+        """Encode example into a format for Arrow.
+
+        Args:
+            value (`str`, `bytes`, `pydicom.FileDataset` or `dict`):
+                Data passed as input to Dicom feature.
+
+        Returns:
+            `dict` with "path" and "bytes" fields
+        """
+        if config.PYDICOM_AVAILABLE:
+            import pydicom
+        else:
+            pydicom = None
+
+        if isinstance(value, str):
+            return {"path": value, "bytes": None}
+        elif isinstance(value, Path):
+            return {"path": str(value.absolute()), "bytes": None}
+        elif isinstance(value, (bytes, bytearray)):
+            return {"path": None, "bytes": value}
+        elif pydicom is not None and isinstance(value, pydicom.dataset.FileDataset):
+            return encode_pydicom_dataset(value)
+        elif isinstance(value, dict):
+            if value.get("path") is not None and os.path.isfile(value["path"]):
+                return {"bytes": None, "path": value.get("path")}
+            elif value.get("bytes") is not None or value.get("path") is not None:
+                return {"bytes": value.get("bytes"), "path": value.get("path")}
+            else:
+                raise ValueError(
+                    f"A dicom sample should have one of 'path' or 'bytes' but they are missing or None in {value}."
+                )
+        else:
+            raise ValueError(
+                f"A dicom sample should be a string, bytes, Path, pydicom FileDataset, or dict, but got {type(value)}."
+            )
+
+    def decode_example(
+        self, value: DicomDict, token_per_repo_id: Optional[Dict[str, Union[str, bool]]] = None
+    ) -> "pydicom.FileDataset":
+        """Decode example DICOM file into a pydicom FileDataset object.
+
+        Args:
+            value (`dict`):
+                A dictionary with keys:
+
+                - `path`: String with absolute or relative DICOM file path.
+                - `bytes`: The bytes of the DICOM file.
+            token_per_repo_id (`dict`, *optional*):
+                To access and decode DICOM files from private repositories on
+                the Hub, you can pass a dictionary
+                repo_id (`str`) -> token (`bool` or `str`).
+
+        Returns:
+            `pydicom.FileDataset` object
+        """
+        if not self.decode:
+            raise NotImplementedError("Decoding is disabled for this feature. Please use Dicom(decode=True) instead.")
+
+        if config.PYDICOM_AVAILABLE:
+            import pydicom
+        else:
+            raise ImportError("To support decoding DICOM files, please install 'pydicom'.")
+
+        if token_per_repo_id is None:
+            token_per_repo_id = {}
+
+        path, bytes_ = value["path"], value["bytes"]
+        if bytes_ is None:
+            if path is None:
+                raise ValueError(f"A dicom should have one of 'path' or 'bytes' but both are None in {value}.")
+            else:
+                if is_local_path(path):
+                    dicom = pydicom.dcmread(path, force=self.force)
+                else:
+                    source_url = path.split("::")[-1]
+                    pattern = (
+                        config.HUB_DATASETS_URL
+                        if source_url.startswith(config.HF_ENDPOINT)
+                        else config.HUB_DATASETS_HFFS_URL
+                    )
+                    try:
+                        repo_id = string_to_dict(source_url, pattern)["repo_id"]
+                        token = token_per_repo_id.get(repo_id)
+                    except ValueError:
+                        token = None
+                    download_config = DownloadConfig(token=token)
+                    with xopen(path, "rb", download_config=download_config) as f:
+                        dicom = pydicom.dcmread(f, force=self.force)
+        else:
+            bytesio = BytesIO(bytes_)
+            dicom = pydicom.dcmread(bytesio, force=self.force)
+
+        return dicom
+
+    def flatten(self) -> Union["FeatureType", Dict[str, "FeatureType"]]:
+        """If in the decodable state, return the feature itself, otherwise flatten the feature into a dictionary."""
+        from .features import Value
+
+        return (
+            self
+            if self.decode
+            else {
+                "bytes": Value("binary"),
+                "path": Value("string"),
+            }
+        )
+
+    def cast_storage(self, storage: Union[pa.StringArray, pa.StructArray, pa.BinaryArray]) -> pa.StructArray:
+        """Cast an Arrow array to the Dicom arrow storage type.
+
+        The Arrow types that can be converted to the Dicom pyarrow storage type are:
+
+        - `pa.string()` - it must contain the "path" data
+        - `pa.binary()` - it must contain the DICOM bytes
+        - `pa.struct({"bytes": pa.binary()})`
+        - `pa.struct({"path": pa.string()})`
+        - `pa.struct({"bytes": pa.binary(), "path": pa.string()})` - order doesn't matter
+
+        Args:
+            storage (`Union[pa.StringArray, pa.StructArray, pa.BinaryArray]`):
+                PyArrow array to cast.
+
+        Returns:
+            `pa.StructArray`: Array in the Dicom arrow storage type, that is
+                `pa.struct({"bytes": pa.binary(), "path": pa.string()})`.
+        """
+        if pa.types.is_string(storage.type):
+            bytes_array = pa.array([None] * len(storage), type=pa.binary())
+            storage = pa.StructArray.from_arrays([bytes_array, storage], ["bytes", "path"], mask=storage.is_null())
+        elif pa.types.is_binary(storage.type):
+            path_array = pa.array([None] * len(storage), type=pa.string())
+            storage = pa.StructArray.from_arrays([storage, path_array], ["bytes", "path"], mask=storage.is_null())
+        elif pa.types.is_struct(storage.type):
+            if storage.type.get_field_index("bytes") >= 0:
+                bytes_array = storage.field("bytes")
+            else:
+                bytes_array = pa.array([None] * len(storage), type=pa.binary())
+            if storage.type.get_field_index("path") >= 0:
+                path_array = storage.field("path")
+            else:
+                path_array = pa.array([None] * len(storage), type=pa.string())
+            storage = pa.StructArray.from_arrays([bytes_array, path_array], ["bytes", "path"], mask=storage.is_null())
+        return array_cast(storage, self.pa_type)
+
+
+def encode_pydicom_dataset(dicom_ds: "pydicom.FileDataset") -> dict[str, Optional[Union[str, bytes]]]:
+    """
+    Encode a pydicom FileDataset object into a dictionary.
+
+    If the dataset has an associated file path, returns the path. Otherwise, serializes
+    the dataset content into bytes.
+
+    Args:
+        dicom_ds: A pydicom FileDataset object.
+
+    Returns:
+        dict: A dictionary with a "path" or "bytes" field.
+    """
+    if hasattr(dicom_ds, "filename") and dicom_ds.filename:
+        return {"path": dicom_ds.filename, "bytes": None}
+
+    # Serialize to bytes in the standard DICOM File Format (pydicom>=3 API)
+    buffer = BytesIO()
+    dicom_ds.save_as(buffer, enforce_file_format=True)
+    return {"path": None, "bytes": buffer.getvalue()}
diff --git a/src/datasets/features/features.py b/src/datasets/features/features.py
index 88259767ae0..e706130b18a 100644
--- a/src/datasets/features/features.py
+++ b/src/datasets/features/features.py
@@ -41,6 +41,7 @@
 from ..utils import experimental, logging
 from ..utils.py_utils import asdict, first_non_null_value, zip_dict
 from .audio import Audio
+from .dicom import Dicom
 from .image import Image, encode_pil_image
 from .nifti import Nifti
 from .pdf import Pdf, encode_pdfplumber_pdf
@@ -1272,6 +1273,7 @@
     Video,
     Pdf,
     Nifti,
+    Dicom,
 ]
 
 
@@ -1431,6 +1433,7 @@ def decode_nested_example(schema, obj, token_per_repo_id: Optional[dict[str, Uni
     Video.__name__: Video,
     Pdf.__name__: Pdf,
     Nifti.__name__: Nifti,
+    Dicom.__name__: Dicom,
 }
 
 
@@ -1767,6 +1770,9 @@ class Features(dict):
     - [`Nifti`] feature to store the absolute path to a NIfTI neuroimaging file, a `nibabel.Nifti1Image` object
       or a dictionary with the relative path to a NIfTI file ("path" key) and its bytes content ("bytes" key).
       This feature loads the NIfTI file lazily with nibabel.
+    - [`Dicom`] feature to store the absolute path to a DICOM medical imaging file, a `pydicom.dataset.FileDataset` object
+      or a dictionary with the relative path to a DICOM file ("path" key) and its bytes content ("bytes" key).
+      This feature loads the DICOM file lazily with pydicom.
     - [`Translation`] or [`TranslationVariableLanguages`] feature specific to Machine Translation.
     """
diff --git a/src/datasets/packaged_modules/__init__.py b/src/datasets/packaged_modules/__init__.py
index 9d076df44b7..f83d406228c 100644
--- a/src/datasets/packaged_modules/__init__.py
+++ b/src/datasets/packaged_modules/__init__.py
@@ -8,6 +8,7 @@
 from .audiofolder import audiofolder
 from .cache import cache
 from .csv import csv
+from .dicomfolder import dicomfolder
 from .hdf5 import hdf5
 from .imagefolder import imagefolder
 from .json import json
@@ -48,6 +49,7 @@ def _hash_python_lines(lines: list[str]) -> str:
     "videofolder": (videofolder.__name__, _hash_python_lines(inspect.getsource(videofolder).splitlines())),
     "pdffolder": (pdffolder.__name__, _hash_python_lines(inspect.getsource(pdffolder).splitlines())),
     "niftifolder": (niftifolder.__name__, _hash_python_lines(inspect.getsource(niftifolder).splitlines())),
+    "dicomfolder": (dicomfolder.__name__, _hash_python_lines(inspect.getsource(dicomfolder).splitlines())),
     "webdataset": (webdataset.__name__, _hash_python_lines(inspect.getsource(webdataset).splitlines())),
     "xml": (xml.__name__, _hash_python_lines(inspect.getsource(xml).splitlines())),
     "hdf5": (hdf5.__name__, _hash_python_lines(inspect.getsource(hdf5).splitlines())),
@@ -93,6 +95,8 @@ def _hash_python_lines(lines: list[str]) -> str:
 _EXTENSION_TO_MODULE.update({ext.upper(): ("pdffolder", {}) for ext in pdffolder.PdfFolder.EXTENSIONS})
 _EXTENSION_TO_MODULE.update({ext: ("niftifolder", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS})
 _EXTENSION_TO_MODULE.update({ext.upper(): ("niftifolder", {}) for ext in niftifolder.NiftiFolder.EXTENSIONS})
+_EXTENSION_TO_MODULE.update({ext: ("dicomfolder", {}) for ext in dicomfolder.DicomFolder.EXTENSIONS})
+_EXTENSION_TO_MODULE.update({ext.upper(): ("dicomfolder", {}) for ext in dicomfolder.DicomFolder.EXTENSIONS})
 
 # Used to filter data files based on extensions given a module name
 _MODULE_TO_EXTENSIONS: dict[str, list[str]] = {}
@@ -111,3 +115,4 @@ def _hash_python_lines(lines: list[str]) -> str:
 _MODULE_TO_METADATA_FILE_NAMES["videofolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
 _MODULE_TO_METADATA_FILE_NAMES["pdffolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
 _MODULE_TO_METADATA_FILE_NAMES["niftifolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
+_MODULE_TO_METADATA_FILE_NAMES["dicomfolder"] = imagefolder.ImageFolder.METADATA_FILENAMES
diff --git a/src/datasets/packaged_modules/dicomfolder/__init__.py b/src/datasets/packaged_modules/dicomfolder/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/src/datasets/packaged_modules/dicomfolder/dicomfolder.py b/src/datasets/packaged_modules/dicomfolder/dicomfolder.py
new file mode 100644
index 00000000000..5eb953cc545
--- /dev/null
+++ b/src/datasets/packaged_modules/dicomfolder/dicomfolder.py
@@ -0,0 +1,23 @@
+import datasets
+
+from ..folder_based_builder import folder_based_builder
+
+
+logger = datasets.utils.logging.get_logger(__name__)
+
+
+class DicomFolderConfig(folder_based_builder.FolderBasedBuilderConfig):
+    """BuilderConfig for DicomFolder."""
+
+    drop_labels: bool = None
+    drop_metadata: bool = None
+
+    def __post_init__(self):
+        super().__post_init__()
+
+
+class DicomFolder(folder_based_builder.FolderBasedBuilder):
+    BASE_FEATURE = datasets.Dicom
+    BASE_COLUMN_NAME = "dicom"
+    BUILDER_CONFIG_CLASS = DicomFolderConfig
+    EXTENSIONS: list[str] = [".dcm", ".dicom"]
diff --git a/tests/features/data/test_dicom_693_J2KI.dcm b/tests/features/data/test_dicom_693_J2KI.dcm
new file mode 100644
index 00000000000..adca15ec417
Binary files /dev/null and b/tests/features/data/test_dicom_693_J2KI.dcm differ
diff --git a/tests/features/data/test_dicom_no_meta.dcm b/tests/features/data/test_dicom_no_meta.dcm
new file mode 100644
index 00000000000..3eb05a6a53c
Binary files /dev/null and b/tests/features/data/test_dicom_no_meta.dcm differ
diff --git a/tests/features/test_dicom.py b/tests/features/test_dicom.py
new file mode 100644
index 00000000000..577309da3fa
--- /dev/null
+++ b/tests/features/test_dicom.py
@@ -0,0 +1,141 @@
+from pathlib import Path
+
+import pytest
+
+from datasets import Dataset, Dicom, Features
+from datasets.features.dicom import encode_pydicom_dataset
+
+from ..utils import require_pydicom
+
+
+@require_pydicom
+@pytest.mark.parametrize(
+    "build_example",
+    [
+        lambda dicom_path: dicom_path,
+        lambda dicom_path: Path(dicom_path),
+        lambda dicom_path: open(dicom_path, "rb").read(),
+        lambda dicom_path: {"path": dicom_path},
+        lambda dicom_path: {"path": dicom_path, "bytes": None},
+        lambda dicom_path: {"path": dicom_path, "bytes": open(dicom_path, "rb").read()},
+        lambda dicom_path: {"path": None, "bytes": open(dicom_path, "rb").read()},
+        lambda dicom_path: {"bytes": open(dicom_path, "rb").read()},
+    ],
+)
+def test_dicom_feature_encode_example(tmp_path, build_example):
+    import pydicom
+    from pydicom import examples
+
+    dicom_path = str(tmp_path / "test_example_dicom.dcm")
+    ds = examples.ct
+    ds.save_as(dicom_path, enforce_file_format=True)
+
+    dicom = Dicom()
+    encoded_example = dicom.encode_example(build_example(dicom_path))
+    assert isinstance(encoded_example, dict)
+    assert encoded_example.keys() == {"bytes", "path"}
+    assert encoded_example["bytes"] is not None or encoded_example["path"] is not None
+    decoded_example = dicom.decode_example(encoded_example)
+    assert isinstance(decoded_example, pydicom.dataset.FileDataset)
+
+
+@require_pydicom
+def test_dataset_with_dicom_feature(tmp_path):
+    import pydicom
+    from pydicom import examples
+
+    dicom_path = str(tmp_path / "test_example_dicom.dcm")
+    ds = examples.mr
+    ds.save_as(dicom_path, enforce_file_format=True)
+
+    data = {"dicom": [dicom_path]}
+    features = Features({"dicom": Dicom()})
+    dset = Dataset.from_dict(data, features=features)
+    item = dset[0]
+    assert item.keys() == {"dicom"}
+    assert isinstance(item["dicom"], pydicom.dataset.FileDataset)
+    batch = dset[:1]
+    assert len(batch) == 1
+    assert batch.keys() == {"dicom"}
+    assert isinstance(batch["dicom"], list) and all(
+        isinstance(item, pydicom.dataset.FileDataset) for item in batch["dicom"]
+    )
+    column = dset["dicom"]
+    assert len(column) == 1
+    assert all(isinstance(item, pydicom.dataset.FileDataset) for item in column)
+
+    # from bytes
+    with open(dicom_path, "rb") as f:
+        data = {"dicom": [f.read()]}
+    dset = Dataset.from_dict(data, features=features)
+    item = dset[0]
+    assert item.keys() == {"dicom"}
+    assert isinstance(item["dicom"], pydicom.dataset.FileDataset)
+
+
+@require_pydicom
+def test_dataset_cast_dicom_column(shared_datadir):
+    """Test the example from the Dicom docstring using shared_datadir"""
+    import pydicom
+
+    # File taken from: https://github.com/robyoung/dicom-test-files/blob/master/data/pydicom/693_J2KI.dcm
+    dicom_path = str(shared_datadir / "test_dicom_693_J2KI.dcm")
+
+    # decode=True (default)
+    ds = Dataset.from_dict({"dicom": [dicom_path]}).cast_column("dicom", Dicom())
+    assert ds.features["dicom"] == Dicom(decode=True, id=None)
+    assert isinstance(ds[0]["dicom"], pydicom.dataset.FileDataset)
+
+    # decode=False
+    ds = ds.cast_column("dicom", Dicom(decode=False))
+    assert ds.features["dicom"] == Dicom(decode=False, id=None)
+    item = ds[0]["dicom"]
+    assert isinstance(item, dict)
+    assert item.keys() == {"bytes", "path"}
+    assert item["path"] == dicom_path
+    assert item["bytes"] is None
+
+
+@require_pydicom
+def test_dicom_force_parameter(shared_datadir):
+    """Test loading a DICOM file that requires force=True"""
+    import pydicom
+
+    # File from: https://github.com/pydicom/pydicom/blob/main/src/pydicom/data/test_files/no_meta.dcm
+    # This file is missing the DICOM File Meta Information header but can be read using force=True
+    dicom_path = str(shared_datadir / "test_dicom_no_meta.dcm")
+
+    ds_no_force = Dataset.from_dict({"dicom": [dicom_path]}).cast_column("dicom", Dicom(force=False))
+    with pytest.raises(pydicom.errors.InvalidDicomError):
+        ds_no_force[0]
+
+    ds_with_force = Dataset.from_dict({"dicom": [dicom_path]}).cast_column("dicom", Dicom(force=True))
+    item = ds_with_force[0]
+    assert isinstance(item["dicom"], pydicom.dataset.FileDataset)
+
+
+@require_pydicom
+def test_encode_pydicom_dataset(tmp_path):
+    import pydicom
+    from pydicom import examples
+
+    dicom_path = str(tmp_path / "test_example_dicom.dcm")
+    ds = examples.rt_ss
+    ds.save_as(dicom_path, enforce_file_format=True)
+
+    img = pydicom.dcmread(dicom_path)
+    encoded_example = encode_pydicom_dataset(img)
+    dicom = Dicom()
+    assert isinstance(encoded_example, dict)
+    assert encoded_example.keys() == {"bytes", "path"}
+    assert encoded_example["path"] is not None and encoded_example["bytes"] is None
+    decoded_example = dicom.decode_example(encoded_example)
+    assert isinstance(decoded_example, pydicom.dataset.FileDataset)
+
+    # test bytes only
+    img.filename = None
+    encoded_example_bytes = encode_pydicom_dataset(img)
+    assert encoded_example_bytes["bytes"] is not None
+    assert encoded_example_bytes["path"] is None
+    decoded_example_bytes = dicom.decode_example(encoded_example_bytes)
+    assert isinstance(decoded_example_bytes, pydicom.dataset.FileDataset)
diff --git a/tests/utils.py b/tests/utils.py
index 1980cf3e257..1a55654105a 100644
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -235,6 +235,18 @@ def require_nibabel(test_case):
     return test_case
 
 
+def require_pydicom(test_case):
+    """
+    Decorator marking a test that requires pydicom.
+
+    These tests are skipped when pydicom isn't installed.
+    """
+    if not config.PYDICOM_AVAILABLE:
+        test_case = unittest.skip("test requires pydicom")(test_case)
+    return test_case
+
+
 def require_transformers(test_case):
     """
     Decorator marking a test that requires transformers.