Skip to content

Commit 1c32d25

Browse files
committed
Improvements to characterize_data script.
Explicitly specify GDCMImageIO for the analysis of DICOM series when calling ReadImageInformation. Otherwise, non-DICOM files that contain DICOM meta-data (files derived from DICOM) are incorrectly identified as DICOM. Use the full path for series file names so that they match the file names used when the analysis is done per_file.
1 parent 05752f1 commit 1c32d25

File tree

1 file changed

+27
-17
lines changed

1 file changed

+27
-17
lines changed

Python/scripts/characterize_data.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -441,14 +441,18 @@ def inspect_series(root_dir, series_tags, meta_data_info={}, thumbnail_settings=
441441
all_series_files = {}
442442
additional_series_tags = series_tags - {"0020|000e", "0020|000d"}
443443
reader = sitk.ImageFileReader()
444+
# explicitly set ImageIO to GDCMImageIO so that non-DICOM files that
445+
# contain DICOM tags (files converted from the original DICOM) will be
446+
# ignored.
447+
reader.SetImageIO("GDCMImageIO")
444448
# collect the file names of all series into a dictionary with the key being
445449
# study:series. This traversal is faster, O(n), than calling GetGDCMSeriesIDs on each
446450
# directory followed by iterating over the series and calling
447451
# GetGDCMSeriesFileNames with the seriesID on that directory, O(n^2).
448452
for dir_name, subdir_names, file_names in os.walk(root_dir):
449453
for file in file_names:
450454
try:
451-
fname = os.path.join(dir_name, file)
455+
fname = os.path.join(os.path.abspath(dir_name), file)
452456
reader.SetFileName(fname)
453457
reader.ReadImageInformation()
454458
sid = reader.GetMetaData("0020|000e")
@@ -723,27 +727,32 @@ def characterize_data(argv=None):
723727
of using os.walk to traverse the file system (internally os.walk uses os.scandir and that method's
724728
documentation says "The entries are yielded in arbitrary order.").
725729
726-
Convert from x, y, z (zero based) indexes from the "summary image" to information from "summary csv" file.
730+
Convert from x, y, z (zero based) indexes from the "summary image" to information from
731+
"summary csv" file. To view the summary image and obtain the x-y-z coordinates for a
732+
specific thumbnail we recommend using one of the following free programs:
733+
Fiji (uses zero based indexing): https://imagej.net/software/fiji/
734+
3D Slicer (uses zero based indexing): https://www.slicer.org/
735+
ITK-SNAP (uses one based indexing, subtract one): http://www.itksnap.org/
736+
727737
728738
import pandas as pd
729739
import SimpleITK as sitk
730740
731-
def xyz_to_index(x, y, z):
732-
tile_size=[20, 20]
733-
thumbnail_size=[64, 64]
734-
# add 2 to the index because the csv starts counting lines at 1 and the first
735-
# line is the table header
741+
def xyz_to_index(x, y, z, thumbnail_size, tile_size):
736742
return (z * tile_size[0] * tile_size[1]
737743
+ int(y / thumbnail_size[1]) * tile_size[0]
738744
+ int(x / thumbnail_size[0])
739745
)
740746
741-
csv_file_name = "Output/generic_image_data_report.csv"
742-
df = pd.read_csv(csv_file_name)
747+
summary_csv_file_name =
748+
df = pd.read_csv(summary_csv_file_name)
749+
# Ensure dataframe matches the read images. If the report included files that
750+
# were not read (non-image or read failures) remove them.
751+
df.dropna(inplace=True, thresh=2)
743752
744-
file_names = eval(df["files"].iloc[xyz_to_index(x=xval, y=yval, z=zval)])
745-
print(file_names)
746-
sitk.Show(sitk.ReadImage(file_names))
753+
thumbnail_size =
754+
tile_size =
755+
print(df["files"].iloc[xyz_to_index(x, y, z, thumbnail_size, tile_size)])
747756
"""
748757
# Configure argument parser for commandline arguments and set default
749758
# values.
@@ -990,11 +999,12 @@ def xyz_to_index(x, y, z):
990999
df.to_csv(args.output_file, index=False)
9911000

9921001
# minimal analysis on the image information, detect image duplicates and plot the image size
993-
# distribution and distribution of min/max intensity values of scalar
994-
# images
995-
image_counts = (
996-
df["MD5 intensity hash"].dropna().value_counts().reset_index(name="count")
997-
)
1002+
# distribution and distribution of min/max intensity values of scalar images
1003+
# first drop the rows that correspond to problematic files if they weren't already dropped
1004+
# based on program settings
1005+
if not args.ignore_problems:
1006+
df.dropna(inplace=True, thresh=2)
1007+
image_counts = df["MD5 intensity hash"].value_counts().reset_index(name="count")
9981008
duplicates = df[
9991009
df["MD5 intensity hash"].isin(
10001010
image_counts[image_counts["count"] > 1]["MD5 intensity hash"]

0 commit comments

Comments
 (0)