Improvements to charactarize_data script.

zivy · zivy · commit 1c32d25981ae · 2025-03-19T12:49:51.000-04:00
Explicitly specify GDCMImageIO for the analysis of DICOM series when
calling the ReadImageInformation. Otherwise non DICOM format files
that contain DICOM meta-data information (derived from DICOM) are
identified incorrectly as DICOM.

Use full path to series file names to match the file names used when
analysis is done per_file.
diff --git a/Python/scripts/characterize_data.py b/Python/scripts/characterize_data.py
@@ -441,14 +441,18 @@ def inspect_series(root_dir, series_tags, meta_data_info={}, thumbnail_settings=
     all_series_files = {}
     additional_series_tags = series_tags - {"0020|000e", "0020|000d"}
     reader = sitk.ImageFileReader()
+    # explicitly set ImageIO to GDCMImageIO so that non DICOM files that
+    # contain DICOM tags (file converted from original DICOM) will be
+    # ignored.
+    reader.SetImageIO("GDCMImageIO")
     # collect the file names of all series into a dictionary with the key being
     # study:series. This traversal is faster, O(n), than calling GetGDCMSeriesIDs on each
     # directory followed by iterating over the series and calling
     # GetGDCMSeriesFileNames with the seriesID on that directory, O(n^2).
     for dir_name, subdir_names, file_names in os.walk(root_dir):
         for file in file_names:
             try:
-                fname = os.path.join(dir_name, file)
+                fname = os.path.join(os.path.abspath(dir_name), file)
                 reader.SetFileName(fname)
                 reader.ReadImageInformation()
                 sid = reader.GetMetaData("0020|000e")
@@ -723,27 +727,32 @@ def characterize_data(argv=None):
     of using os.walk to traverse the file system (internally os.walk uses os.scandir and that method's
     documentation says "The entries are yielded in arbitrary order.").
 
-    Convert from x, y, z (zero based) indexes from the "summary image" to information from "summary csv" file.
+    Convert from x, y, z (zero based) indexes from the "summary image" to information from
+    "summary csv" file. To view the summary image and obtain the x-y-z coordinates for a
+    specific thumbnail we recommend using one of the following free programs:
+    Fiji (uses zero based indexing): https://imagej.net/software/fiji/
+    3D slicer (uses zero based indexing): https://www.slicer.org/
+    ITK-SNAP (uses one based indexing, subtract one): http://www.itksnap.org/
+
 
     import pandas as pd
     import SimpleITK as sitk
 
-    def xyz_to_index(x, y, z):
-        tile_size=[20, 20]
-        thumbnail_size=[64, 64]
-        # add 2 to the index because the csv starts counting lines at 1 and the first
-        # line is the table header
+    def xyz_to_index(x, y, z, thumbnail_size, tile_size):
         return (z * tile_size[0] * tile_size[1]
                 + int(y / thumbnail_size[1]) * tile_size[0]
                 + int(x / thumbnail_size[0])
                 )
 
-    csv_file_name = "Output/generic_image_data_report.csv"
-    df = pd.read_csv(csv_file_name)
+    summary_csv_file_name =
+    df = pd.read_csv(summary_csv_file_name)
+    # Ensure dataframe matches the read images. If the report included files that
+    # were not read (non-image or read failures) remove them.
+    df.dropna(inplace=True, thresh=2)
 
-    file_names = eval(df["files"].iloc[xyz_to_index(x=xval, y=yval, z=zval)])
-    print(file_names)
-    sitk.Show(sitk.ReadImage(file_names))
+    thumbnail_size =
+    tile_size =
+    print(df["files"].iloc[xyz_to_index(x, y, z, thumbnail_size, tile_size)])
     """
     # Configure argument parser for commandline arguments and set default
     # values.
@@ -990,11 +999,12 @@ def xyz_to_index(x, y, z):
     df.to_csv(args.output_file, index=False)
 
     # minimal analysis on the image information, detect image duplicates and plot the image size
-    # distribution and distribution of min/max intensity values of scalar
-    # images
-    image_counts = (
-        df["MD5 intensity hash"].dropna().value_counts().reset_index(name="count")
-    )
+    # distribution and distribution of min/max intensity values of scalar images
+    # first drop the rows that correspond to problematic files if they weren't already dropped
+    # based on program settings
+    if not args.ignore_problems:
+        df.dropna(inplace=True, thresh=2)
+    image_counts = df["MD5 intensity hash"].value_counts().reset_index(name="count")
     duplicates = df[
         df["MD5 intensity hash"].isin(
             image_counts[image_counts["count"] > 1]["MD5 intensity hash"]