@@ -38,32 +38,82 @@ def get_opt_parser():
38
38
39
39
Option ("-H" , "--header-fields" ,
40
40
dest = "header_fields" , default = 'all' ,
41
- help = "Header fields (comma separated) to be printed as well (if present)" ),
41
+ help = "Header fields (comma separated) to be printed as well"
42
+ " (if present)" ),
43
+
44
+ Option ("--ma" , "--data-max-abs-diff" ,
45
+ dest = "data_max_abs_diff" ,
46
+ type = float ,
47
+ default = 0.0 ,
48
+ help = "Maximal absolute difference in data between files"
49
+ " to tolerate." ),
50
+
51
+ Option ("--mr" , "--data-max-rel-diff" ,
52
+ dest = "data_max_rel_diff" ,
53
+ type = float ,
54
+ default = 0.0 ,
55
+ help = "Maximal relative difference in data between files to"
56
+ " tolerate. If --data-max-abs-diff is also specified,"
57
+ " only the data points with absolute difference greater"
58
+ " than that value would be considered for relative"
59
+ " difference check." ),
60
+ Option ("--dt" , "--datatype" ,
61
+ dest = "dtype" ,
62
+ default = np .float64 ,
63
+ help = "Enter a numpy datatype such as 'float32'." )
42
64
])
43
65
44
66
return p
45
67
46
68
47
69
def are_values_different (* values ):
48
- """Generically compares values, returns true if different"""
49
- value0 = values [0 ]
50
- values = values [1 :] # to ensure that the first value isn't compared with itself
51
-
52
- for value in values :
53
- try : # we sometimes don't want NaN values
54
- if np .any (np .isnan (value0 )) and np .any (np .isnan (value )): # if they're both NaN
55
- break
56
- elif np .any (np .isnan (value0 )) or np .any (np .isnan (value )): # if only 1 is NaN
57
- return True
70
+ """Generically compare values, return True if different
58
71
59
- except TypeError :
60
- pass
72
+ Note that this comparison is targeted at reporting differences between headers,
73
+ so has following specifics:
74
+ - even a difference in data types is considered a difference, i.e. 1 != 1.0
75
+ - nans are considered to be the "same", although generally nan != nan
76
+ """
77
+ value0 = values [0 ]
61
78
79
+ # to not recompute over again
80
+ if isinstance (value0 , np .ndarray ):
81
+ try :
82
+ # np.asarray for elderly numpys, e.g. 1.7.1 where for
83
+ # degenerate arrays (shape ()) it would return a pure scalar
84
+ value0_nans = np .asanyarray (np .isnan (value0 ))
85
+ value0_nonnans = np .asanyarray (np .logical_not (value0_nans ))
86
+ # if value0_nans.size == 1:
87
+ # import pdb; pdb.set_trace()
88
+ if not np .any (value0_nans ):
89
+ value0_nans = None
90
+ except TypeError as exc :
91
+ str_exc = str (exc )
92
+ # Not implemented in numpy 1.7.1
93
+ if "not supported" in str_exc or "ot implemented" in str_exc :
94
+ value0_nans = None
95
+ else :
96
+ raise
97
+
98
+ for value in values [1 :]:
62
99
if type (value0 ) != type (value ): # if types are different, then we consider them different
63
100
return True
64
101
elif isinstance (value0 , np .ndarray ):
65
- return np .any (value0 != value )
66
-
102
+ if value0 .dtype != value .dtype or \
103
+ value0 .shape != value .shape :
104
+ return True
105
+ # there might be nans and they need special treatment
106
+ if value0_nans is not None :
107
+ value_nans = np .isnan (value )
108
+ if np .any (value0_nans != value_nans ):
109
+ return True
110
+ if np .any (value0 [value0_nonnans ] != value [value0_nonnans ]):
111
+ return True
112
+ elif np .any (value0 != value ):
113
+ return True
114
+ elif value0 is np .nan :
115
+ if value is not np .nan :
116
+ return True
67
117
elif value0 != value :
68
118
return True
69
119
@@ -101,8 +151,8 @@ def get_headers_diff(file_headers, names=None):
101
151
return difference
102
152
103
153
104
- def get_data_diff (files ):
105
- """Get difference between md5 values
154
+ def get_data_hash_diff (files , dtype = np . float64 ):
155
+ """Get difference between md5 values of data
106
156
107
157
Parameters
108
158
----------
@@ -115,7 +165,7 @@ def get_data_diff(files):
115
165
"""
116
166
117
167
md5sums = [
118
- hashlib .md5 (np .ascontiguousarray (nib .load (f ).get_data (), dtype = np . float32 )).hexdigest ()
168
+ hashlib .md5 (np .ascontiguousarray (nib .load (f ).get_fdata ( dtype = dtype ) )).hexdigest ()
119
169
for f in files
120
170
]
121
171
@@ -125,6 +175,86 @@ def get_data_diff(files):
125
175
return md5sums
126
176
127
177
178
def get_data_diff(files, max_abs=0, max_rel=0, dtype=np.float64):
    """Get difference between data

    Parameters
    ----------
    files: list of (str or ndarray)
      If list of strings is provided -- they must be existing file names
    max_abs: float, optional
      Maximal absolute difference to tolerate.
    max_rel: float, optional
      Maximal relative difference to tolerate.  The relative difference of
      a data point is `abs(d1 - d2) / (0.5 * (abs(d1) + abs(d2)))`.
      If `max_abs` is specified, then those data points with absolute
      difference not exceeding it are not considered for relative
      difference testing.
    dtype: np, optional
      Datatype to be used when extracting data from files

    Returns
    -------
    diffs: OrderedDict
      An ordered dict with a record per each file which has differences
      with any of the subsequent files.  Each record is a list with one
      entry per file: None for files which were not compared (the file
      itself and the preceding ones) or which do not differ, otherwise a
      difference record.
      Each difference record is an OrderedDict with keys 'abs' and 'rel'
      holding maximal absolute and relative differences between the pair,
      or the record {'CMP': 'incompat'} if shapes are incompatible.
    """

    # we are doomed to keep them in RAM now
    data = [f if isinstance(f, np.ndarray) else nib.load(f).get_fdata(dtype=dtype)
            for f in files]
    diffs = OrderedDict()
    for i, d1 in enumerate(data[:-1]):
        # populate empty entries for non-compared
        diffs1 = [None] * (i + 1)

        for d2 in data[i + 1:]:
            if d1.shape != d2.shape:
                diffs1.append({'CMP': "incompat"})
                continue

            abs_diff = np.abs(d1 - d2)
            mean_abs = (np.abs(d1) + np.abs(d2)) * 0.5
            # Only points which actually differ are candidates.  A non-zero
            # absolute difference implies a non-zero mean of magnitudes, so
            # the division below is safe.  (Selecting on `mean_abs != 0`
            # instead would flag identical non-zero data as different.)
            candidates = abs_diff != 0

            if max_abs:
                candidates[abs_diff <= max_abs] = False

            diff_rec = None
            if np.any(candidates):
                rel_diff = abs_diff[candidates] / mean_abs[candidates]
                max_rel_diff = np.max(rel_diff)
                if max_rel:
                    sub_thr = rel_diff <= max_rel
                    # Since we operated on sub-selected values already,
                    # we need to plug them back in
                    candidates[
                        tuple(indexes[sub_thr]
                              for indexes in np.where(candidates))
                    ] = False

                if np.any(candidates):
                    diff_rec = OrderedDict()  # so that abs goes before relative
                    # maxima are taken only here, so that empty arrays
                    # (which have no candidates) do not reach np.max
                    diff_rec['abs'] = np.max(abs_diff).astype(dtype)
                    diff_rec['rel'] = max_rel_diff.astype(dtype)

            diffs1.append(diff_rec)

        if any(diffs1):
            diffs['DATA(diff %d:)' % (i + 1)] = diffs1

    return diffs
256
+
257
+
128
258
def display_diff (files , diff ):
129
259
"""Format header differences into a nice string
130
260
@@ -140,21 +270,27 @@ def display_diff(files, diff):
140
270
"""
141
271
output = ""
142
272
field_width = "{:<15}"
273
+ filename_width = "{:<53}"
143
274
value_width = "{:<55}"
144
275
145
276
output += "These files are different.\n "
146
- output += field_width .format ('Field' )
277
+ output += field_width .format ('Field/File ' )
147
278
148
- for f in files :
149
- output += value_width .format (os .path .basename (f ))
279
+ for i , f in enumerate ( files , 1 ) :
280
+ output += "%d:%s" % ( i , filename_width .format (os .path .basename (f ) ))
150
281
151
282
output += "\n "
152
283
153
284
for key , value in diff .items ():
154
285
output += field_width .format (key )
155
286
156
287
for item in value :
157
- item_str = str (item )
288
+ if isinstance (item , dict ):
289
+ item_str = ', ' .join ('%s: %s' % i for i in item .items ())
290
+ elif item is None :
291
+ item_str = '-'
292
+ else :
293
+ item_str = str (item )
158
294
# Value might start/end with some invisible spacing characters so we
159
295
# would "condition" it on both ends a bit
160
296
item_str = re .sub ('^[ \t ]+' , '<' , item_str )
@@ -169,8 +305,40 @@ def display_diff(files, diff):
169
305
return output
170
306
171
307
308
def diff(files, header_fields='all', data_max_abs_diff=None,
         data_max_rel_diff=None, dtype=np.float64):
    """Collect header and data differences between files

    Parameters
    ----------
    files: list of str
      Names of the files to compare; at least two are required.
    header_fields: str, optional
      Comma separated names of header fields to compare, or 'all' to
      use the fields of the first file's header.
    data_max_abs_diff: float, optional
      Passed to `get_data_diff` as `max_abs`.
    data_max_rel_diff: float, optional
      Passed to `get_data_diff` as `max_rel`.
    dtype: np, optional
      Datatype passed to `get_data_hash_diff` and `get_data_diff`.

    Returns
    -------
    The result of `get_headers_diff`, extended with a 'DATA(md5)' entry
    and the records of `get_data_diff` whenever the data hashes differ
    and `get_data_diff` reports differences.
    """
    assert len(files) >= 2, "Please enter at least two files"

    headers = [nib.load(fname).header for fname in files]

    # 'all' signals "all fields"
    # TODO: header fields might vary across file types,
    #       thus prior sensing would be needed
    if header_fields == 'all':
        fields = headers[0].keys()
    else:
        fields = header_fields.split(',')

    result = get_headers_diff(headers, fields)

    md5_diffs = get_data_hash_diff(files, dtype)
    if not md5_diffs:
        return result

    # hashes differ: get the details, which might still end up ignoring
    # the difference in data given the tolerances
    detailed = get_data_diff(files,
                             max_abs=data_max_abs_diff,
                             max_rel=data_max_rel_diff,
                             dtype=dtype)
    if detailed:
        result['DATA(md5)'] = md5_diffs
        result.update(detailed)

    return result
337
+
338
+
172
339
def main (args = None , out = None ):
173
340
"""Getting the show on the road"""
341
+
174
342
out = out or sys .stdout
175
343
parser = get_opt_parser ()
176
344
(opts , files ) = parser .parse_args (args )
@@ -181,27 +349,17 @@ def main(args=None, out=None):
181
349
# suppress nibabel format-compliance warnings
182
350
nib .imageglobals .logger .level = 50
183
351
184
- assert len (files ) >= 2 , "Please enter at least two files"
352
+ files_diff = diff (
353
+ files ,
354
+ header_fields = opts .header_fields ,
355
+ data_max_abs_diff = opts .data_max_abs_diff ,
356
+ data_max_rel_diff = opts .data_max_rel_diff ,
357
+ dtype = opts .dtype
358
+ )
185
359
186
- file_headers = [nib .load (f ).header for f in files ]
187
-
188
- # signals "all fields"
189
- if opts .header_fields == 'all' :
190
- # TODO: header fields might vary across file types, thus prior sensing would be needed
191
- header_fields = file_headers [0 ].keys ()
192
- else :
193
- header_fields = opts .header_fields .split (',' )
194
-
195
- diff = get_headers_diff (file_headers , header_fields )
196
- data_diff = get_data_diff (files )
197
-
198
- if data_diff :
199
- diff ['DATA(md5)' ] = data_diff
200
-
201
- if diff :
202
- out .write (display_diff (files , diff ))
360
+ if files_diff :
361
+ out .write (display_diff (files , files_diff ))
203
362
raise SystemExit (1 )
204
-
205
363
else :
206
364
out .write ("These files are identical.\n " )
207
365
raise SystemExit (0 )
0 commit comments