@@ -373,34 +373,48 @@ def inspect_single_series(series_data, meta_data_info={}, thumbnail_settings={})
373373 reader = sitk .ImageSeriesReader ()
374374 reader .MetaDataDictionaryArrayUpdateOn ()
375375 reader .LoadPrivateTagsOn ()
376- # CHANGE to split list of tag values and sid is first entry
376+ # split list of tag values, sid is first entry (see inspect_series)
377377 sid = series_data [0 ].split (":" )[0 ]
378378 file_names = series_info ["files" ]
379- # As the files comprising a series with multiple files can reside in
380- # separate directories and SimpleITK expects them to be in a single directory
379+ # As the files comprising a series can reside in separate directories
380+ # and may have identical file names (e.g. 1/Z0.dcm, 2/Z0.dcm)
381381 # we use a tempdir and symbolic links to enable SimpleITK to read the series as
382- # a single image. Additionally, the files are renamed as they may have resided in
383- # separate directories with the same file name. Finally, on Windows
384- # we need to copy the files to the tempdir as the os.symlink documentation says that
382+ # a single image (ImageSeriesReader_GetGDCMSeriesFileNames expects all files to
383+ # be in a single directory).
384+ # On Windows we need to copy the files to the tempdir as the os.symlink documentation says that
385385 # "On newer versions of Windows 10, unprivileged accounts can create symlinks
386386 # if Developer Mode is enabled. When Developer Mode is not available/enabled,
387387 # the SeCreateSymbolicLinkPrivilege privilege is required, or the process must be
388388 # run as an administrator."
389389 # To turn Developer Mode on in Windows 11:
390390 # Settings->System->For Developers and turn Developer Mode on.
391- # We could then comment out the Windows specific code below.
391+ # We could then use the os.symlink function instead of the indirect usage of a
392+ # copy_link_function below.
392393 with tempfile .TemporaryDirectory () as tmpdirname :
393- if platform .system () == "Windows" :
394- for i , fname in enumerate (file_names ):
395- shutil .copy (
396- os .path .abspath (fname ), os .path .join (tmpdirname , str (i ))
397- )
398- else :
399- for i , fname in enumerate (file_names ):
400- os .symlink (os .path .abspath (fname ), os .path .join (tmpdirname , str (i )))
401- reader .SetFileNames (
402- sitk .ImageSeriesReader_GetGDCMSeriesFileNames (tmpdirname , sid )
394+ copy_link_function = (
395+ shutil .copy if platform .system () == "Windows" else os .symlink
403396 )
397+ new_orig_file_name_dict = {}
398+ for i , fname in enumerate (file_names ):
399+ new_fname = os .path .join (tmpdirname , str (i ))
400+ new_orig_file_name_dict [new_fname ] = fname
401+ copy_link_function (fname , new_fname )
402+ # For some reason, on Windows the returned full paths use a double backslash
403+ # for all directories except the last one, which has a slash. This does not
404+ # match the contents of new_orig_file_name_dict, which has a backslash
405+ # for the last entry too. In the code below we call os.path.normpath to
406+ # address this issue.
407+ sorted_new_file_names = sitk .ImageSeriesReader_GetGDCMSeriesFileNames (
408+ tmpdirname , sid
409+ )
410+ # Store the file names in sorted order so that they are saved in this
411+ # manner. This is useful for reading from the saved csv file
412+ # using the ImageSeriesReader or ReadImage, which expect ordered file names.
413+ series_info ["files" ] = [
414+ new_orig_file_name_dict [os .path .normpath (new_fname )]
415+ for new_fname in sorted_new_file_names
416+ ]
417+ reader .SetFileNames (sorted_new_file_names )
404418 img = reader .Execute ()
405419 for k in meta_data_info .values ():
406420 if reader .HasMetaDataKey (0 , k ):
@@ -441,14 +455,18 @@ def inspect_series(root_dir, series_tags, meta_data_info={}, thumbnail_settings=
441455 all_series_files = {}
442456 additional_series_tags = series_tags - {"0020|000e" , "0020|000d" }
443457 reader = sitk .ImageFileReader ()
458+ # Explicitly set ImageIO to GDCMImageIO so that non-DICOM files that
459+ # contain DICOM tags (files converted from the original DICOM) will be
460+ # ignored.
461+ reader .SetImageIO ("GDCMImageIO" )
444462 # collect the file names of all series into a dictionary with the key being
445463 # study:series. This traversal is faster, O(n), than calling GetGDCMSeriesIDs on each
446464 # directory followed by iterating over the series and calling
447465 # GetGDCMSeriesFileNames with the seriesID on that directory, O(n^2).
448466 for dir_name , subdir_names , file_names in os .walk (root_dir ):
449467 for file in file_names :
450468 try :
451- fname = os .path .join (dir_name , file )
469+ fname = os .path .join (os . path . abspath ( dir_name ) , file )
452470 reader .SetFileName (fname )
453471 reader .ReadImageInformation ()
454472 sid = reader .GetMetaData ("0020|000e" )
@@ -723,27 +741,32 @@ def characterize_data(argv=None):
723741 of using os.walk to traverse the file system (internally os.walk uses os.scandir and that method's
724742 documentation says "The entries are yielded in arbitrary order.").
725743
726- Convert from x, y, z (zero based) indexes from the "summary image" to information from "summary csv" file.
744+ Convert from x, y, z (zero based) indexes from the "summary image" to information from
745+ "summary csv" file. To view the summary image and obtain the x-y-z coordinates for a
746+ specific thumbnail we recommend using one of the following free programs:
747+ Fiji (uses zero based indexing): https://imagej.net/software/fiji/
748+ 3D Slicer (uses zero based indexing): https://www.slicer.org/
749+ ITK-SNAP (uses one based indexing, subtract one): http://www.itksnap.org/
750+
727751
728752 import pandas as pd
729753 import SimpleITK as sitk
730754
731- def xyz_to_index(x, y, z):
732- tile_size=[20, 20]
733- thumbnail_size=[64, 64]
734- # add 2 to the index because the csv starts counting lines at 1 and the first
735- # line is the table header
755+ def xyz_to_index(x, y, z, thumbnail_size, tile_size):
736756 return (z * tile_size[0] * tile_size[1]
737757 + int(y / thumbnail_size[1]) * tile_size[0]
738758 + int(x / thumbnail_size[0])
739759 )
740760
741- csv_file_name = "Output/generic_image_data_report.csv"
742- df = pd.read_csv(csv_file_name)
761+ summary_csv_file_name =
762+ df = pd.read_csv(summary_csv_file_name)
763+ # Ensure dataframe matches the read images. If the report included files that
764+ # were not read (non-image or read failures) remove them.
765+ df.dropna(inplace=True, thresh=2)
743766
744- file_names = eval(df["files"].iloc[xyz_to_index(x=xval, y=yval, z=zval)])
745- print(file_names)
746- sitk.Show(sitk.ReadImage(file_names) )
767+ thumbnail_size =
768+ tile_size =
769+ print(df["files"].iloc[xyz_to_index(x, y, z, thumbnail_size, tile_size)] )
747770 """
748771 # Configure argument parser for commandline arguments and set default
749772 # values.
@@ -990,11 +1013,12 @@ def xyz_to_index(x, y, z):
9901013 df .to_csv (args .output_file , index = False )
9911014
9921015 # minimal analysis on the image information, detect image duplicates and plot the image size
993- # distribution and distribution of min/max intensity values of scalar
994- # images
995- image_counts = (
996- df ["MD5 intensity hash" ].dropna ().value_counts ().reset_index (name = "count" )
997- )
1016+ # distribution and distribution of min/max intensity values of scalar images
1017+ # first drop the rows that correspond to problematic files if they weren't already dropped
1018+ # based on program settings
1019+ if not args .ignore_problems :
1020+ df .dropna (inplace = True , thresh = 2 )
1021+ image_counts = df ["MD5 intensity hash" ].value_counts ().reset_index (name = "count" )
9981022 duplicates = df [
9991023 df ["MD5 intensity hash" ].isin (
10001024 image_counts [image_counts ["count" ] > 1 ]["MD5 intensity hash" ]
0 commit comments