66import cmapPy .pandasGEXpress .setup_GCToo_logger as setup_logger
77import cmapPy .pandasGEXpress .concat_gctoo as cg
88import cmapPy .pandasGEXpress .parse_gct as pg
9+ import tempfile
910
1011
1112logger = logging .getLogger (setup_logger .LOGGER_NAME )
@@ -23,7 +24,7 @@ def test_left_right(self):
2324 expected_gct = pg .parse (expected_gct_path )
2425
2526 # Merge left and right
26- concated_gct = cg .hstack ([left_gct , right_gct ], [], False )
27+ concated_gct = cg .hstack ([left_gct , right_gct ], False , None , [], False )
2728
2829 pd .util .testing .assert_frame_equal (expected_gct .data_df , concated_gct .data_df , check_names = False )
2930 pd .util .testing .assert_frame_equal (expected_gct .row_metadata_df , concated_gct .row_metadata_df , check_names = False )
@@ -39,7 +40,7 @@ def test_top_bottom(self):
3940 expected_gct = pg .parse (expected_gct_path )
4041
4142 # Merge top and bottom
42- concated_gct = cg .vstack ([top_gct , bottom_gct ], [], False )
43+ concated_gct = cg .vstack ([top_gct , bottom_gct ], False , None , [], False )
4344
4445 pd .util .testing .assert_frame_equal (expected_gct .data_df , concated_gct .data_df , check_names = False )
4546 pd .util .testing .assert_frame_equal (expected_gct .row_metadata_df , concated_gct .row_metadata_df , check_names = False )
@@ -70,12 +71,21 @@ def test_assemble_common_meta(self):
7071 logger .debug ("meta2:\n {}" .format (meta2 ))
7172 logger .debug ("e_meta:\n {}" .format (e_meta1 ))
7273
74+ error_report_file = tempfile .NamedTemporaryFile ().name
75+ logger .debug ("rhd3 header needs to be removed - error_report_file: {}" .format (error_report_file ))
7376 with self .assertRaises (cg .MismatchCommonMetadataConcatGctooException ) as e :
74- cg .assemble_common_meta ([meta1 , meta2 ], [], ["my_src1" , "my_src2" ])
77+ cg .assemble_common_meta ([meta1 , meta2 ], [], ["my_src1" , "my_src2" ], False , error_report_file )
7578 self .assertIn ("r3" , str (e .exception ))
7679 logger .debug ("rhd3 header needs to be removed - e.exception: {}" .format (e .exception ))
80+ report_df = pd .read_csv (error_report_file , sep = "\t " )
81+ self .assertGreater (report_df .shape [0 ], 0 )
82+ self .assertGreater (report_df .shape [1 ], 0 )
83+ self .assertIn ("source_file" , report_df .columns )
84+ self .assertIn ("orig_rid" , report_df .columns )
85+ self .assertTrue (set (meta1 .columns ) < set (report_df .columns ))
7786
78- out_meta1 = cg .assemble_common_meta ([meta1 , meta2 ], ["rhd3" ], None )
87+
88+ out_meta1 = cg .assemble_common_meta ([meta1 , meta2 ], ["rhd3" ], None , False , None )
7989 logger .debug ("out_meta1:\n {}" .format (out_meta1 ))
8090 pd .util .testing .assert_frame_equal (out_meta1 , e_meta1 )
8191
@@ -95,7 +105,7 @@ def test_assemble_common_meta(self):
95105
96106 logger .debug ("meta3:\n {}" .format (meta3 ))
97107 logger .debug ("e_meta2:\n {}" .format (e_meta2 ))
98- out_meta2 = cg .assemble_common_meta ([meta1 , meta3 ], [], None )
108+ out_meta2 = cg .assemble_common_meta ([meta1 , meta3 ], [], None , False , None )
99109 pd .util .testing .assert_frame_equal (out_meta2 , e_meta2 )
100110
101111 # Some ids not present in both dfs
@@ -109,7 +119,7 @@ def test_assemble_common_meta(self):
109119 logger .debug ("meta4:\n {}" .format (meta4 ))
110120
111121 with self .assertRaises (cg .MismatchCommonMetadataConcatGctooException ) as e :
112- cg .assemble_common_meta ([meta1 , meta4 ], [], ["my_src1" , "my_src4" ])
122+ cg .assemble_common_meta ([meta1 , meta4 ], [], ["my_src1" , "my_src4" ], False , None )
113123 self .assertIn ("r1" , str (e .exception ))
114124
115125 def test_assemble_concatenated_meta (self ):
@@ -132,9 +142,17 @@ def test_assemble_concatenated_meta(self):
132142 logger .debug ("meta2:\n {}" .format (meta2 ))
133143 logger .debug ("e_concated:\n {}" .format (e_concated ))
134144
135- concated = cg .assemble_concatenated_meta ([meta2 , meta1 ])
145+ concated = cg .assemble_concatenated_meta ([meta2 , meta1 ], False )
146+ logger .debug ("happy path - concated:\n {}" .format (concated ))
136147 pd .util .testing .assert_frame_equal (e_concated , concated )
137148
149+ #remove all metadata
150+ r = cg .assemble_concatenated_meta ([meta2 , meta1 ], True )
151+ logger .debug ("remove all metadata - r:\n {}" .format (r ))
152+ self .assertEqual ((4 ,0 ), r .shape )
153+ self .assertTrue ((e_concated .index == r .index ).all ())
154+
155+
138156 def test_assemble_data (self ):
139157 # Horizontal concat
140158 df1 = pd .DataFrame (
@@ -220,13 +238,20 @@ def test_build_common_all_meta_dfs(self):
220238 index = ["r1" , "r2" , "r3" ],
221239 columns = ["rhd1" , "rhd2" ])
222240
223- r_all , r_all_w_dups = cg .build_common_all_meta_dfs ([meta1 , meta2 ], ["rhd3" ])
241+ r_all , r_all_w_dups = cg .build_common_all_meta_dfs ([meta1 , meta2 ], ["rhd3" ], False )
224242 logger .debug ("rhd3 header needs to be removed - r_all:\n {}" .format (r_all ))
225243 logger .debug ("r_all_w_dups:\n {}" .format (r_all_w_dups ))
226244 self .assertEqual ((3 ,2 ), r_all .shape )
227245 self .assertEqual ((6 ,2 ), r_all_w_dups .shape )
228246 pd .util .testing .assert_frame_equal (e_meta1 , r_all )
229247
248+ #remove all metadata fields
249+ r_all , r_all_w_dups = cg .build_common_all_meta_dfs ([meta1 , meta2 ], [], True )
250+ logger .debug ("remove all metadata fields - r_all\n {}" .format (r_all ))
251+ logger .debug ("r_all_w_dups:\n {}" .format (r_all_w_dups ))
252+ self .assertEqual ((3 ,0 ), r_all .shape )
253+ self .assertTrue ((e_meta1 .index == r_all .index ).all ())
254+
230255
231256 meta4 = pd .DataFrame (
232257 [["r1_1" , "r1_22" , "r1_5" ],
@@ -246,7 +271,7 @@ def test_build_common_all_meta_dfs(self):
246271
247272 # rhd5 not in meta4, so it should be dropped even without being
248273 # explicitly provided
249- out_meta3 = cg .assemble_common_meta ([meta1 , meta4 ], ["rhd2" ], None )
274+ out_meta3 , _ = cg .build_common_all_meta_dfs ([meta1 , meta4 ], ["rhd2" ], False )
250275 logger .debug ("""rhd5 not in meta4 so it should be automatically dropped without being
251276 explictly listed in fields_to_remove - out_meta3:
252277 {}""" .format (out_meta3 ))
@@ -255,14 +280,14 @@ def test_build_common_all_meta_dfs(self):
255280 # Empty metadata
256281 empty_meta = pd .DataFrame ([], index = ["a" , "b" , "c" ])
257282 logger .debug ("empty metadata provided - empty_meta.empty: {}" .format (empty_meta .empty ))
258- out_meta4 = cg .assemble_common_meta ([empty_meta , empty_meta ], [], None )
283+ out_meta4 , _ = cg .build_common_all_meta_dfs ([empty_meta , empty_meta ], [], False )
259284 logger .debug ("empty metadata provided - out_meta4:\n {}" .format (out_meta4 ))
260285 pd .util .testing .assert_frame_equal (out_meta4 , empty_meta )
261286
262287 #metadata has duplicates but index is unique
263288 meta5 = pd .DataFrame ({"rhd1" :[0 ,0 ,1 ]}, index = range (3 ))
264289 meta6 = pd .DataFrame ({"rhd1" :[0 ,0 ,1 ]}, index = range (3 ))
265- out_meta5 = cg .assemble_common_meta ([meta5 , meta6 ], [], None )
290+ out_meta5 , _ = cg .build_common_all_meta_dfs ([meta5 , meta6 ], [], False )
266291 logger .debug ("metadata has duplicates but index is unique - out_meta5:\n {}" .format (out_meta5 ))
267292 self .assertEqual ((3 ,1 ), out_meta5 .shape , "metadata contains duplicates but index is unique - should have been kept" )
268293
@@ -290,19 +315,20 @@ def test_build_mismatched_common_meta_report(self):
290315 logger .debug ("meta1:\n {}" .format (meta1 ))
291316 logger .debug ("meta2:\n {}" .format (meta2 ))
292317 logger .debug ("meta3:\n {}" .format (meta3 ))
293- # logger.debug("meta4:\n{}".format(meta4))
294318
295319 common_meta_dfs = [meta1 , meta2 , meta3 ]
296- all_meta_df , all_meta_df_with_dups = cg .build_common_all_meta_dfs (common_meta_dfs , [])
320+ all_meta_df , all_meta_df_with_dups = cg .build_common_all_meta_dfs (common_meta_dfs , [], False )
297321 common_meta_df_shapes = [x .shape for x in common_meta_dfs ]
298322 sources = ["my_src1" , "my_src2" , "my_src3" ]
299323 self .assertFalse (all_meta_df .index .is_unique , "during setup expected the index to not be unique" )
300324
301325 r = cg .build_mismatched_common_meta_report (common_meta_df_shapes , sources , all_meta_df , all_meta_df_with_dups )
302- logger .debug ("r: {}" .format (r ))
303- self .assertEqual ((3 , 4 ), r .shape )
326+ logger .debug ("r:\n {}" .format (r ))
327+ self .assertEqual ((3 , 5 ), r .shape )
304328 self .assertIn ("source_file" , r .columns )
305- self .assertTrue (all (r .index == "r3" ))
329+ self .assertIn ("orig_rid" , r .columns )
330+ self .assertTrue (set (meta1 .columns ) < set (r .columns ))
331+ self .assertEqual ({"r3" }, set (r .orig_rid ))
306332
307333
308334if __name__ == "__main__" :
0 commit comments