 import logging
 import setup_GCToo_logger as setup_logger
 import pandas as pd
+import numpy
 
 import GCToo
 import parse
@@ -72,6 +73,8 @@ def build_parser():
         help="what to name the output file")
     parser.add_argument("--fields_to_remove", "-ftr", nargs="+", default=[],
         help="fields to remove from the common metadata")
+    parser.add_argument("--remove_all_metadata_fields", "-ramf", action="store_true", default=False,
+        help="remove all metadata fields during operation")
     parser.add_argument("--reset_ids", "-rsi", action="store_true", default=False,
         help="whether to reset ids (use this flag if ids are not unique)")
 
@@ -84,13 +87,18 @@ def build_parser():
     parser.add_argument("-verbose", "-v", action="store_true", default=False,
         help="whether to print a bunch of output")
 
+    parser.add_argument("--error_report_output_file", "-erof", type=str, default=None,
+        help="""destination file for writing out error report - currently information about inconsistent
+        metadata fields in the common dimension of the concat operation""")
+
     return parser
 
 
 def main():
     # get args
     args = build_parser().parse_args(sys.argv[1:])
     setup_logger.setup(verbose=args.verbose)
+    logger.debug("args: {}".format(args))
 
     # Get files directly
     if args.input_filepaths is not None:
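
A quick sketch of how the two new options surface through the parser. The `--input_filepaths` and `--concat_direction` spellings are assumed from the `args` attributes read in `main()`, and the file names are hypothetical:

args = build_parser().parse_args([
    "--input_filepaths", "plate1.gct", "plate2.gct",   # hypothetical inputs
    "--concat_direction", "horiz",
    "--remove_all_metadata_fields",
    "--error_report_output_file", "mismatch_report.txt",
])
assert args.remove_all_metadata_fields                 # store_true flag
assert args.error_report_output_file == "mismatch_report.txt"
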
@@ -120,10 +128,12 @@ def main():
 
     # Create concatenated gctoo object
     if args.concat_direction == "horiz":
-        out_gctoo = hstack(gctoos, args.fields_to_remove, args.reset_ids)
+        out_gctoo = hstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
+            args.fields_to_remove, args.reset_ids)
 
     elif args.concat_direction == "vert":
-        out_gctoo = vstack(gctoos, args.fields_to_remove, args.reset_ids)
+        out_gctoo = vstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
+            args.fields_to_remove, args.reset_ids)
 
     # Write out_gctoo to file
     logger.info("Writing to output file args.out_name: {}".format(args.out_name))
@@ -153,7 +163,7 @@ def get_file_list(wildcard):
     return files
 
 
-def hstack(gctoos, fields_to_remove=[], reset_ids=False):
+def hstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
     """ Horizontally concatenate gctoos.
 
     Args:
@@ -169,18 +179,20 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
     row_meta_dfs = []
     col_meta_dfs = []
     data_dfs = []
+    srcs = []
     for g in gctoos:
         row_meta_dfs.append(g.row_metadata_df)
         col_meta_dfs.append(g.col_metadata_df)
         data_dfs.append(g.data_df)
+        srcs.append(g.src)
 
     logger.debug("shapes of row_meta_dfs: {}".format([x.shape for x in row_meta_dfs]))
 
     # Concatenate row metadata
-    all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove)
+    all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
 
     # Concatenate col metadata
-    all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs)
+    all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs, remove_all_metadata_fields)
 
     # Concatenate the data_dfs
     all_data_df = assemble_data(data_dfs, "horiz")
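
For reference, a hedged sketch of calling the updated hstack directly, assuming parse.parse is the loader that main() uses (the file names are hypothetical):

gctoos = [parse.parse("plate1.gct"), parse.parse("plate2.gct")]  # hypothetical files
# Strip all metadata during the concat and write any mismatch report out:
concated = hstack(gctoos, remove_all_metadata_fields=True,
                  error_report_file="mismatch_report.txt")
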
@@ -202,7 +214,7 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
     return concated
 
 
-def vstack(gctoos, fields_to_remove=[], reset_ids=False):
+def vstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
     """ Vertically concatenate gctoos.
 
     Args:
@@ -218,16 +230,18 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
     row_meta_dfs = []
     col_meta_dfs = []
     data_dfs = []
+    srcs = []
     for g in gctoos:
         row_meta_dfs.append(g.row_metadata_df)
         col_meta_dfs.append(g.col_metadata_df)
         data_dfs.append(g.data_df)
+        srcs.append(g.src)
 
     # Concatenate col metadata
-    all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove)
+    all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
 
     # Concatenate row metadata
-    all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs)
+    all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs, remove_all_metadata_fields)
 
     # Concatenate the data_dfs
     all_data_df = assemble_data(data_dfs, "vert")
@@ -249,7 +263,7 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
     return concated
 
 
-def assemble_common_meta(common_meta_dfs, fields_to_remove):
+def assemble_common_meta(common_meta_dfs, fields_to_remove, sources, remove_all_metadata_fields, error_report_file):
     """ Assemble the common metadata dfs together. Both indices are sorted.
     Fields that are not in all the dfs are dropped.
 
@@ -262,50 +276,138 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove):
         all_meta_df_sorted (pandas df)
 
     """
-    # Remove any column headers that are not present in all dfs (and sort)
-    shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
-    trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
+    all_meta_df, all_meta_df_with_dups = build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields)
+
+    if not all_meta_df.index.is_unique:
+        all_report_df = build_mismatched_common_meta_report([x.shape for x in common_meta_dfs],
+            sources, all_meta_df, all_meta_df_with_dups)
+
+        unique_duplicate_ids = all_report_df["orig_rid"].unique()
+
+        if error_report_file is not None:
+            all_report_df.to_csv(error_report_file, sep="\t")
+
+        msg = """There are inconsistencies in common_metadata_df between different files. Try excluding metadata fields
+using the fields_to_remove argument. unique_duplicate_ids: {}
+all_report_df:
+{}""".format(unique_duplicate_ids, all_report_df)
+        raise MismatchCommonMetadataConcatGctooException(msg)
+
+    # Finally, sort the index
+    all_meta_df_sorted = all_meta_df.sort_index(axis=0)
+
+    return all_meta_df_sorted
+
+
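
A toy illustration of the failure mode this rewrite reports on: the same id carries different metadata in two inputs, so the de-duplicated index is no longer unique and the new exception replaces the old assert (ids, fields, and source names below are hypothetical):

df_a = pd.DataFrame({"pr_gene_symbol": ["TP53"]}, index=["id_1"])
df_b = pd.DataFrame({"pr_gene_symbol": ["EGFR"]}, index=["id_1"])
try:
    assemble_common_meta([df_a, df_b], [], ["a.gct", "b.gct"], False, None)
except MismatchCommonMetadataConcatGctooException as e:
    logger.warning("inconsistent common metadata: {}".format(e))
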
+def build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields):
+    """
+    Concatenate the entries in common_meta_dfs, removing columns selectively (fields_to_remove) or entirely
+    (remove_all_metadata_fields=True; in this case, effectively just merges all the indexes in common_meta_dfs).
+
+    Returns 2 dataframes (in a tuple): the first has duplicates removed, the second does not.
+
+    Args:
+        common_meta_dfs: collection of pandas DataFrames containing the metadata in the "common" direction of the
+            concatenation operation
+        fields_to_remove: columns to be removed (if present) from the common_meta_dfs
+        remove_all_metadata_fields: boolean indicating that all metadata fields should be removed from the
+            common_meta_dfs; overrides fields_to_remove if present
+
+    Returns:
+        tuple containing
+            all_meta_df: pandas dataframe that is the concatenation of the dataframes in common_meta_dfs, with duplicate rows removed
+            all_meta_df_with_dups: the same concatenation before duplicate rows are removed
+    """
+
+    if remove_all_metadata_fields:
+        trimmed_common_meta_dfs = [pd.DataFrame(index=df.index) for df in common_meta_dfs]
+    else:
+        shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
+        logger.debug("shared_column_headers: {}".format(shared_column_headers))
 
-    # Remove any column headers that will prevent dfs from being identical
-    for df in trimmed_common_meta_dfs:
-        df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
+        trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
+
+        # Remove any column headers that will prevent dfs from being identical
+        for df in trimmed_common_meta_dfs:
+            df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
 
     # Concatenate all dfs and then remove duplicate rows
     all_meta_df_with_dups = pd.concat(trimmed_common_meta_dfs, axis=0)
+    logger.debug("all_meta_df_with_dups.shape: {}".format(all_meta_df_with_dups.shape))
+    logger.debug("all_meta_df_with_dups.columns: {}".format(all_meta_df_with_dups.columns))
+    logger.debug("all_meta_df_with_dups.index: {}".format(all_meta_df_with_dups.index))
 
     # If all metadata dfs were empty, df will be empty
     if all_meta_df_with_dups.empty:
-
         # Simply return unique ids
         all_meta_df = pd.DataFrame(index=all_meta_df_with_dups.index.unique())
 
     else:
         all_meta_df_with_dups["concat_gctoo_column_for_index"] = all_meta_df_with_dups.index
         all_meta_df = all_meta_df_with_dups.copy(deep=True).drop_duplicates()
         all_meta_df.drop("concat_gctoo_column_for_index", axis=1, inplace=True)
+        all_meta_df_with_dups.drop("concat_gctoo_column_for_index", axis=1, inplace=True)
 
     logger.debug("all_meta_df_with_dups.shape: {}".format(all_meta_df_with_dups.shape))
     logger.debug("all_meta_df.shape: {}".format(all_meta_df.shape))
 
-    # If there are still duplicate ids, then their metadata didn't align
-    # in different gcts
+    return (all_meta_df, all_meta_df_with_dups)
+
+
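
The temporary concat_gctoo_column_for_index column is what lets drop_duplicates treat rows with identical metadata but different ids as distinct; rows that agree on both id and metadata collapse to one. A toy run (ids and fields hypothetical):

df_a = pd.DataFrame({"pr_gene_symbol": ["TP53"]}, index=["id_1"])
df_b = pd.DataFrame({"pr_gene_symbol": ["TP53"]}, index=["id_1"])
deduped, with_dups = build_common_all_meta_df([df_a, df_b], [], False)
assert with_dups.shape[0] == 2   # plain concatenation keeps both rows
assert deduped.shape[0] == 1     # same id + same metadata collapses to one
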
+def build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta_df, all_meta_df_with_dups):
+    """
+    Generate a report (dataframe) that indicates, for each entry in the common metadata that does not match
+    across the input files, which source file contained which of the mismatched values.
+
+    Args:
+        common_meta_df_shapes: list of tuples that are the shapes of the common meta dataframes
+        sources: list of the source files that the dataframes were loaded from
+        all_meta_df: produced from build_common_all_meta_df
+        all_meta_df_with_dups: produced from build_common_all_meta_df
+
+    Returns:
+        all_report_df: dataframe indicating the mismatched row metadata values and the corresponding source file
+    """
+    expanded_sources = []
+    for (i, shape) in enumerate(common_meta_df_shapes):
+        src = sources[i]
+        expanded_sources.extend([src for _ in xrange(shape[0])])
+    expanded_sources = numpy.array(expanded_sources)
+    logger.debug("len(expanded_sources): {}".format(len(expanded_sources)))
+
     duplicate_ids = all_meta_df.index[all_meta_df.index.duplicated(keep=False)]
 
-    assert all_meta_df.index.is_unique, (
-        ("There are inconsistencies in common_metadata_df between " +
-         "different files.\nTry excluding metadata fields " +
-         "using the fields_to_remove argument.\n"
-         "duplicate_ids[0]: {id}\n" +
-         "all_meta_df.loc[{id}, :]:\n{df}").format(id=duplicate_ids[0],
-            df=all_meta_df.loc[duplicate_ids[0], :]))
+    unique_duplicate_ids = duplicate_ids.unique()
+    logger.debug("unique_duplicate_ids: {}".format(unique_duplicate_ids))
 
-    # Finally, sort the index
-    all_meta_df_sorted = all_meta_df.sort_index(axis=0)
+    duplicate_ids_meta_df = all_meta_df.loc[unique_duplicate_ids]
 
-    return all_meta_df_sorted
+    report_df_list = []
+    for unique_dup_id in unique_duplicate_ids:
+        rows = duplicate_ids_meta_df.loc[unique_dup_id]
+
+        matching_row_locs = numpy.array([False for _ in xrange(all_meta_df_with_dups.shape[0])])
+        for i in xrange(rows.shape[0]):
+            r = rows.iloc[i]
+            row_comparison = r == all_meta_df_with_dups
+            matching_row_locs = matching_row_locs | row_comparison.all(axis=1).values
 
+        report_df = all_meta_df_with_dups.loc[matching_row_locs].copy()
+        report_df["source_file"] = expanded_sources[matching_row_locs]
+        logger.debug("report_df.shape: {}".format(report_df.shape))
+        report_df_list.append(report_df)
 
-def assemble_concatenated_meta(concated_meta_dfs):
+    all_report_df = pd.concat(report_df_list, axis=0)
+    all_report_df["orig_rid"] = all_report_df.index
+    all_report_df.index = pd.Index(xrange(all_report_df.shape[0]), name="index")
+    logger.debug("all_report_df.shape: {}".format(all_report_df.shape))
+    logger.debug("all_report_df.index: {}".format(all_report_df.index))
+    logger.debug("all_report_df.columns: {}".format(all_report_df.columns))
+
+    return all_report_df
+
+
+def assemble_concatenated_meta(concated_meta_dfs, remove_all_metadata_fields):
     """ Assemble the concatenated metadata dfs together. For example,
     if horizontally concatenating, the concatenated metadata dfs are the
     column metadata dfs. Both indices are sorted.
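
A companion sketch of the report itself: when id_1 disagrees across two hypothetical source files, each conflicting row lands in the report tagged with its source_file and original id (orig_rid):

df_a = pd.DataFrame({"pr_gene_symbol": ["TP53"]}, index=["id_1"])
df_b = pd.DataFrame({"pr_gene_symbol": ["EGFR"]}, index=["id_1"])
deduped, with_dups = build_common_all_meta_df([df_a, df_b], [], False)
report = build_mismatched_common_meta_report(
    [df_a.shape, df_b.shape], ["a.gct", "b.gct"], deduped, with_dups)
print(report[["orig_rid", "pr_gene_symbol", "source_file"]])
#        orig_rid pr_gene_symbol source_file
# index
# 0          id_1           TP53       a.gct
# 1          id_1           EGFR       b.gct
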
@@ -318,6 +420,10 @@ def assemble_concatenated_meta(concated_meta_dfs):
 
     """
     # Concatenate the concated_meta_dfs
+    if remove_all_metadata_fields:
+        for df in concated_meta_dfs:
+            df.drop(df.columns, axis=1, inplace=True)
+
     all_concated_meta_df = pd.concat(concated_meta_dfs, axis=0)
 
     # Sanity check: the number of rows in all_concated_meta_df should correspond
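
Worth noting: df.drop(df.columns, ...) above mutates the caller's dataframes in place, so after a remove_all_metadata_fields run the inputs are left column-less too. A toy run, assuming the rest of the function (outside this hunk) only sanity-checks and sorts:

m1 = pd.DataFrame({"pert_id": ["x1"]}, index=["sample_1"])   # hypothetical metadata
m2 = pd.DataFrame({"pert_id": ["x2"]}, index=["sample_2"])
out = assemble_concatenated_meta([m1, m2], True)
assert out.shape == (2, 0)   # ids survive, metadata columns do not
assert m1.shape == (1, 0)    # the input df was emptied in place
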
@@ -430,5 +536,8 @@ def reset_ids_in_meta_df(meta_df):
     meta_df.index.name = original_index_name
 
 
+class MismatchCommonMetadataConcatGctooException(Exception):
+    pass
+
 if __name__ == "__main__":
     main()
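
Because the hard assert is now a named exception, callers can trap the mismatch and retry, for example with all metadata stripped. A hedged sketch, not code from this commit:

try:
    out_gctoo = hstack(gctoos, False, "mismatch_report.txt")
except MismatchCommonMetadataConcatGctooException:
    logger.warning("common metadata mismatch - retrying with metadata removed")
    out_gctoo = hstack(gctoos, True, "mismatch_report.txt")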