diff --git a/README.md b/README.md index d17989b..3fc2ffb 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ Please see our [installation guide](./INSTALL.md) © 2025 Databricks, Inc. All rights reserved. The source in this project is provided subject to the Databricks License [https://databricks.com/db-license-source]. All included or referenced third party libraries are subject to the licenses set forth below. -| Package | Purpose | License | Source | -|---------|---------|---------|--------| -| pydicom | Python api for DICOM files | MIT | https://github.com/pydicom/pydicom | +| Datasource | Package | Purpose | License | Source | +| ---------- | ---------- | --------------------------------- | ----------- | ------------------------------------ | +| zipdcm | pydicom | Python api for DICOM files | MIT | https://github.com/pydicom/pydicom | +| zipdcm | pylibjpeg | Decoding / Encoding pixel formats | GPLv3 & MIT | https://github.com/pydicom/pylibjpeg | diff --git a/zipdcm/README.md b/zipdcm/README.md index f0fb046..1e36461 100644 --- a/zipdcm/README.md +++ b/zipdcm/README.md @@ -8,10 +8,15 @@ from dbx.zip_dcm_ds import ZipDCMDataSource spark.dataSource.register(ZipDCMDataSource) # read DCMs with `numPartitions` parallelism. -df = spark.read.format("zipdcm").option('numPartitions',4).load("./resources") +df = ( + spark.read + .format("zipdcm") + .option("numPartitions",4) + .load("./resources") +) df.display() ``` -For more, see our [demo]($./demo) notebook. +For more, see our [demo](./zip-dicom-demo.ipynb) notebook. ## Install @@ -38,3 +43,6 @@ Run unit tests ```bash make test ``` + +### Synthetic PHI data source citation +Rutherford, M. W., Nolan, T., Pei, L., Wagner, U., Pan, Q., Farmer, P., Smith, K., Kopchick, B., Laura Opsahl-Ong, Sutton, G., Clunie, D. A., Farahani, K., & Prior, F. (2025). Data in Support of the MIDI-B Challenge (MIDI-B-Synthetic-Validation, MIDI-B-Curated-Validation, MIDI-B-Synthetic-Test, MIDI-B-Curated-Test) (Version 1) [Dataset]. The Cancer Imaging Archive. https://doi.org/10.7937/CF2P-AW56 diff --git a/zipdcm/requirements.txt b/zipdcm/requirements.txt index 84130fb..075b521 100644 --- a/zipdcm/requirements.txt +++ b/zipdcm/requirements.txt @@ -1,2 +1,3 @@ pydicom==3.0.1 +pylibjpeg[all]>=2.0.1 pyspark==4.0.0.dev1 diff --git a/zipdcm/zip-dicom-demo.ipynb b/zipdcm/zip-dicom-demo.ipynb index 2031db6..04e78e9 100644 --- a/zipdcm/zip-dicom-demo.ipynb +++ b/zipdcm/zip-dicom-demo.ipynb @@ -19,11 +19,14 @@ "# Read Zipped DICOM files saving time and storage\n", "WIth the custom \"zipdcm\" Python Data Source, we can read zipped (and non Zipped) up DICOM files directly to extract their metadata.\n", "\n", - "Requirements:\n", - "- Recommend DBR 17.0 (Apache Spark 4.0) compute\n", + "### Requirements:\n", + "- Recommend DBR 17.1 (Apache Spark 4.0) dedicated compute\n", "- Shared cluster compute compatible\n", "- Working on serverless compute fix.\n", - "- Requires `pydicom==3.0.1`" + "- Requires `pydicom==3.0.1 pylibjpeg[all]>=2.0.1`\n", + "\n", + "### Synthetic PHI data source citation\n", + "Rutherford, M. W., Nolan, T., Pei, L., Wagner, U., Pan, Q., Farmer, P., Smith, K., Kopchick, B., Laura Opsahl-Ong, Sutton, G., Clunie, D. A., Farahani, K., & Prior, F. (2025). Data in Support of the MIDI-B Challenge (MIDI-B-Synthetic-Validation, MIDI-B-Curated-Validation, MIDI-B-Synthetic-Test, MIDI-B-Curated-Test) (Version 1) [Dataset]. The Cancer Imaging Archive. https://doi.org/10.7937/CF2P-AW56 " ] }, { @@ -53,7 +56,8 @@ } ], "source": [ - "%pip install --quiet pydicom==3.0.1\n", + "# %pip install --quiet numpy==1.26.4 pydicom==3.0.1 pylibjpeg[all]>=2.0.1\n", + "%pip install --quiet numpy==2.1.3 pydicom==3.0.1 pylibjpeg[all]>=2.0.1\n", "%restart_python" ] }, @@ -79,7 +83,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "total 57M\n-rwxrwxrwx 1 root root 12K Aug 1 21:09 1.3.199.1.2.3712432.1.402.1107814368275696879.zip\n-rwxrwxrwx 1 root root 24M Aug 1 21:09 3.5.574.1.3.9030958.6.376.1780887819048872979.zip\n-rwxrwxrwx 1 root root 12M Aug 1 21:09 3.5.574.1.3.9030958.6.376.2860280475000825621.zip\ndrwxrwxrwx 2 root root 4.0K Aug 2 17:33 x\n-rwxrwxrwx 1 root root 12M Aug 1 21:09 x.zip\ndrwxrwxrwx 2 root root 4.0K Aug 2 17:33 y\n-rwxrwxrwx 1 root root 12M Aug 1 21:09 y.zip\n" + "total 57M\n-rwxrwxrwx 1 root root 12K Aug 1 21:09 1.3.199.1.2.3712432.1.402.1107814368275696879.zip\n-rwxrwxrwx 1 root root 24M Aug 1 21:09 3.5.574.1.3.9030958.6.376.1780887819048872979.zip\n-rwxrwxrwx 1 root root 12M Aug 1 21:09 3.5.574.1.3.9030958.6.376.2860280475000825621.zip\ndrwxrwxrwx 2 root root 4.0K Aug 10 04:17 x\n-rwxrwxrwx 1 root root 12M Aug 1 21:09 x.zip\ndrwxrwxrwx 2 root root 4.0K Aug 10 04:17 y\n-rwxrwxrwx 1 root root 12M Aug 1 21:09 y.zip\n" ] } ], @@ -213,24 +217,35 @@ "spark.dataSource.register(ZipDCMDataSource)\n", "\n", "# read DCMs with `numPartitions` parallelism.\n", - "df = spark.read.format(\"zipdcm\").option('numPartitions',4).load(\"./resources\")\n", + "df = (\n", + " spark.read\n", + " .format(\"zipdcm\")\n", + " .option(\"numPartitions\",4)\n", + " .load(\"./resources\")\n", + ")\n", "df.display()" ] } ], "metadata": { "application/vnd.databricks.v1+notebook": { - "computePreferences": null, + "computePreferences": { + "hardware": { + "accelerator": null, + "gpuPoolId": null, + "memory": null + } + }, "dashboards": [], "environmentMetadata": { - "base_environment": "dbe_65bc13ea-276c-4905-a728-9fe2fb1780e2", + "base_environment": "", "environment_version": "2" }, "inputWidgetPreferences": null, "language": "python", "notebookMetadata": { "mostRecentlyExecutedCommandWithImplicitDF": { - "commandId": 7424973428825328, + "commandId": 5816783787054213, "dataframes": [ "_sqldf" ]