Skip to content

Commit 1b0b73f

Browse files
authored
Merge pull request #472 from zivy/scriptImprovements
Script improvements
2 parents 05752f1 + db47101 commit 1b0b73f

File tree

1 file changed

+58
-34
lines changed

1 file changed

+58
-34
lines changed

Python/scripts/characterize_data.py

Lines changed: 58 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -373,34 +373,48 @@ def inspect_single_series(series_data, meta_data_info={}, thumbnail_settings={})
373373
reader = sitk.ImageSeriesReader()
374374
reader.MetaDataDictionaryArrayUpdateOn()
375375
reader.LoadPrivateTagsOn()
376-
# CHANGE to split list of tag values and sid is first entry
376+
# split list of tag values, sid is first entry (see inspect_series)
377377
sid = series_data[0].split(":")[0]
378378
file_names = series_info["files"]
379-
# As the files comprising a series with multiple files can reside in
380-
# separate directories and SimpleITK expects them to be in a single directory
379+
# As the files comprising a series can reside in separate directories
380+
# and may have identical file names (e.g. 1/Z0.dcm, 2/Z0.dcm)
381381
# we use a tempdir and symbolic links to enable SimpleITK to read the series as
382-
# a single image. Additionally, the files are renamed as they may have resided in
383-
# separate directories with the same file name. Finally, on Windows
384-
# we need to copy the files to the tempdir as the os.symlink documentation says that
382+
# a single image (ImageSeriesReader_GetGDCMSeriesFileNames expects all files to
383+
# be in a single directory).
384+
# On Windows we need to copy the files to the tempdir as the os.symlink documentation says that
385385
# "On newer versions of Windows 10, unprivileged accounts can create symlinks
386386
# if Developer Mode is enabled. When Developer Mode is not available/enabled,
387387
# the SeCreateSymbolicLinkPrivilege privilege is required, or the process must be
388388
# run as an administrator."
389389
# To turn Developer Mode on in Windows 11:
390390
# Settings->System->For Developers and turn Developer Mode on.
391-
# We could then comment out the Windows specific code below.
391+
# We could then use the os.symlink function instead of the indirect usage of a
392+
# copy_link_function below.
392393
with tempfile.TemporaryDirectory() as tmpdirname:
393-
if platform.system() == "Windows":
394-
for i, fname in enumerate(file_names):
395-
shutil.copy(
396-
os.path.abspath(fname), os.path.join(tmpdirname, str(i))
397-
)
398-
else:
399-
for i, fname in enumerate(file_names):
400-
os.symlink(os.path.abspath(fname), os.path.join(tmpdirname, str(i)))
401-
reader.SetFileNames(
402-
sitk.ImageSeriesReader_GetGDCMSeriesFileNames(tmpdirname, sid)
394+
copy_link_function = (
395+
shutil.copy if platform.system() == "Windows" else os.symlink
403396
)
397+
new_orig_file_name_dict = {}
398+
for i, fname in enumerate(file_names):
399+
new_fname = os.path.join(tmpdirname, str(i))
400+
new_orig_file_name_dict[new_fname] = fname
401+
copy_link_function(fname, new_fname)
402+
# For some reason on Windows the returned full paths use double backslash
403+
# for all directories except the last one which has a slash. This does not
404+
# match the contents of the new_orig_file_name_dict which has a backslash
405+
# for the last entry too. In the code below we call os.path.normpath to
406+
# address this issue.
407+
sorted_new_file_names = sitk.ImageSeriesReader_GetGDCMSeriesFileNames(
408+
tmpdirname, sid
409+
)
410+
# store the file names in a sorted order so that they are saved in this
411+
# manner. This is useful for reading from the saved csv file
412+
# using the ImageSeriesReader or ReadImage which expect ordered file names
413+
series_info["files"] = [
414+
new_orig_file_name_dict[os.path.normpath(new_fname)]
415+
for new_fname in sorted_new_file_names
416+
]
417+
reader.SetFileNames(sorted_new_file_names)
404418
img = reader.Execute()
405419
for k in meta_data_info.values():
406420
if reader.HasMetaDataKey(0, k):
@@ -441,14 +455,18 @@ def inspect_series(root_dir, series_tags, meta_data_info={}, thumbnail_settings=
441455
all_series_files = {}
442456
additional_series_tags = series_tags - {"0020|000e", "0020|000d"}
443457
reader = sitk.ImageFileReader()
458+
# explicitly set ImageIO to GDCMImageIO so that non-DICOM files that
459+
# contain DICOM tags (files converted from original DICOM) will be
460+
# ignored.
461+
reader.SetImageIO("GDCMImageIO")
444462
# collect the file names of all series into a dictionary with the key being
445463
# study:series. This traversal is faster, O(n), than calling GetGDCMSeriesIDs on each
446464
# directory followed by iterating over the series and calling
447465
# GetGDCMSeriesFileNames with the seriesID on that directory, O(n^2).
448466
for dir_name, subdir_names, file_names in os.walk(root_dir):
449467
for file in file_names:
450468
try:
451-
fname = os.path.join(dir_name, file)
469+
fname = os.path.join(os.path.abspath(dir_name), file)
452470
reader.SetFileName(fname)
453471
reader.ReadImageInformation()
454472
sid = reader.GetMetaData("0020|000e")
@@ -723,27 +741,32 @@ def characterize_data(argv=None):
723741
of using os.walk to traverse the file system (internally os.walk uses os.scandir and that method's
724742
documentation says "The entries are yielded in arbitrary order.").
725743
726-
Convert from x, y, z (zero based) indexes from the "summary image" to information from "summary csv" file.
744+
Convert from x, y, z (zero based) indexes from the "summary image" to information from
745+
"summary csv" file. To view the summary image and obtain the x-y-z coordinates for a
746+
specific thumbnail we recommend using one of the following free programs:
747+
Fiji (uses zero based indexing): https://imagej.net/software/fiji/
748+
3D Slicer (uses zero based indexing): https://www.slicer.org/
749+
ITK-SNAP (uses one based indexing, subtract one): http://www.itksnap.org/
750+
727751
728752
import pandas as pd
729753
import SimpleITK as sitk
730754
731-
def xyz_to_index(x, y, z):
732-
tile_size=[20, 20]
733-
thumbnail_size=[64, 64]
734-
# add 2 to the index because the csv starts counting lines at 1 and the first
735-
# line is the table header
755+
def xyz_to_index(x, y, z, thumbnail_size, tile_size):
736756
return (z * tile_size[0] * tile_size[1]
737757
+ int(y / thumbnail_size[1]) * tile_size[0]
738758
+ int(x / thumbnail_size[0])
739759
)
740760
741-
csv_file_name = "Output/generic_image_data_report.csv"
742-
df = pd.read_csv(csv_file_name)
761+
summary_csv_file_name =
762+
df = pd.read_csv(summary_csv_file_name)
763+
# Ensure dataframe matches the read images. If the report included files that
764+
# were not read (non-image or read failures) remove them.
765+
df.dropna(inplace=True, thresh=2)
743766
744-
file_names = eval(df["files"].iloc[xyz_to_index(x=xval, y=yval, z=zval)])
745-
print(file_names)
746-
sitk.Show(sitk.ReadImage(file_names))
767+
thumbnail_size =
768+
tile_size =
769+
print(df["files"].iloc[xyz_to_index(x, y, z, thumbnail_size, tile_size)])
747770
"""
748771
# Configure argument parser for commandline arguments and set default
749772
# values.
@@ -990,11 +1013,12 @@ def xyz_to_index(x, y, z):
9901013
df.to_csv(args.output_file, index=False)
9911014

9921015
# minimal analysis on the image information, detect image duplicates and plot the image size
993-
# distribution and distribution of min/max intensity values of scalar
994-
# images
995-
image_counts = (
996-
df["MD5 intensity hash"].dropna().value_counts().reset_index(name="count")
997-
)
1016+
# distribution and distribution of min/max intensity values of scalar images
1017+
# first drop the rows that correspond to problematic files if they weren't already dropped
1018+
# based on program settings
1019+
if not args.ignore_problems:
1020+
df.dropna(inplace=True, thresh=2)
1021+
image_counts = df["MD5 intensity hash"].value_counts().reset_index(name="count")
9981022
duplicates = df[
9991023
df["MD5 intensity hash"].isin(
10001024
image_counts[image_counts["count"] > 1]["MD5 intensity hash"]

0 commit comments

Comments
 (0)