Skip to content

Commit 4f288b8

Browse files
author
Dave Lahr
committed
pandasGEXpress/concat_gctoo: add options to 1) ignore all metadata fields when concatenating and 2) print a full report when encountering an error caused by mismatched common metadata
1 parent 245f9bd commit 4f288b8

File tree

2 files changed

+80
-32
lines changed

2 files changed

+80
-32
lines changed

cmapPy/pandasGEXpress/concat_gctoo.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ def build_parser():
7373
help="what to name the output file")
7474
parser.add_argument("--fields_to_remove", "-ftr", nargs="+", default=[],
7575
help="fields to remove from the common metadata")
76+
parser.add_argument("--remove_all_metadata_fields", "-ramf", action="store_true", default=False,
77+
help="remove all metadata fields during operation")
7678
parser.add_argument("--reset_ids", "-rsi", action="store_true", default=False,
7779
help="whether to reset ids (use this flag if ids are not unique)")
7880

@@ -85,6 +87,10 @@ def build_parser():
8587
parser.add_argument("-verbose", "-v", action="store_true", default=False,
8688
help="whether to print a bunch of output")
8789

90+
parser.add_argument("--error_report_output_file", "-erof", type=str, default="concat_gctoo_errors.txt",
91+
help="""destination file for writing out error report - currently information about inconsistent
92+
metadata fields in the common dimension of the concat operation""")
93+
8894
return parser
8995

9096

@@ -154,7 +160,7 @@ def get_file_list(wildcard):
154160
return files
155161

156162

157-
def hstack(gctoos, fields_to_remove=[], reset_ids=False):
163+
def hstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
158164
""" Horizontally concatenate gctoos.
159165
160166
Args:
@@ -180,10 +186,10 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
180186
logger.debug("shapes of row_meta_dfs: {}".format([x.shape for x in row_meta_dfs]))
181187

182188
# Concatenate row metadata
183-
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs)
189+
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
184190

185191
# Concatenate col metadata
186-
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs)
192+
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs, remove_all_metadata_fields)
187193

188194
# Concatenate the data_dfs
189195
all_data_df = assemble_data(data_dfs, "horiz")
@@ -205,7 +211,7 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
205211
return concated
206212

207213

208-
def vstack(gctoos, fields_to_remove=[], reset_ids=False):
214+
def vstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
209215
""" Vertically concatenate gctoos.
210216
211217
Args:
@@ -229,10 +235,10 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
229235
srcs.append(g.src)
230236

231237
# Concatenate col metadata
232-
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs)
238+
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
233239

234240
# Concatenate row metadata
235-
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs)
241+
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs, remove_all_metadata_fields)
236242

237243
# Concatenate the data_dfs
238244
all_data_df = assemble_data(data_dfs, "vert")
@@ -254,7 +260,7 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
254260
return concated
255261

256262

257-
def assemble_common_meta(common_meta_dfs, fields_to_remove, sources):
263+
def assemble_common_meta(common_meta_dfs, fields_to_remove, sources, remove_all_metadata_fields, error_report_file):
258264
""" Assemble the common metadata dfs together. Both indices are sorted.
259265
Fields that are not in all the dfs are dropped.
260266
@@ -267,14 +273,17 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove, sources):
267273
all_meta_df_sorted (pandas df)
268274
269275
"""
270-
all_meta_df, all_meta_df_with_dups = build_common_all_meta_dfs(common_meta_dfs, fields_to_remove)
276+
all_meta_df, all_meta_df_with_dups = build_common_all_meta_dfs(common_meta_dfs, fields_to_remove, remove_all_metadata_fields)
271277

272278
if not all_meta_df.index.is_unique:
273279
all_report_df = build_mismatched_common_meta_report([x.shape for x in common_meta_dfs],
274280
sources, all_meta_df, all_meta_df_with_dups)
275281

276282
unique_duplicate_ids = all_report_df.index.unique()
277283

284+
if error_report_file is not None:
285+
all_report_df.to_csv(error_report_file, sep="\t")
286+
278287
msg = """There are inconsistencies in common_metadata_df between different files. Try excluding metadata fields
279288
using the fields_to_remove argument. unique_duplicate_ids: {}
280289
all_report_df:
@@ -287,16 +296,20 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove, sources):
287296
return all_meta_df_sorted
288297

289298

290-
def build_common_all_meta_dfs(common_meta_dfs, fields_to_remove):
299+
def build_common_all_meta_dfs(common_meta_dfs, fields_to_remove, remove_all_metadata_fields):
291300
# Remove any column headers that are not present in all dfs (and sort)
292-
shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
293-
logger.debug("shared_column_headers: {}".format(shared_column_headers))
294301

295-
trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
302+
if remove_all_metadata_fields:
303+
trimmed_common_meta_dfs = [pd.DataFrame(index=df.index) for df in common_meta_dfs]
304+
else:
305+
shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
306+
logger.debug("shared_column_headers: {}".format(shared_column_headers))
296307

297-
# Remove any column headers that will prevent dfs from being identical
298-
for df in trimmed_common_meta_dfs:
299-
df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
308+
trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
309+
310+
# Remove any column headers that will prevent dfs from being identical
311+
for df in trimmed_common_meta_dfs:
312+
df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
300313

301314
# Concatenate all dfs and then remove duplicate rows
302315
all_meta_df_with_dups = pd.concat(trimmed_common_meta_dfs, axis=0)
@@ -352,11 +365,16 @@ def build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta
352365
report_df_list.append(report_df)
353366

354367
all_report_df = pd.concat(report_df_list, axis=0)
368+
all_report_df["orig_rid"] = all_report_df.index
369+
all_report_df.index = pd.Index(xrange(all_report_df.shape[0]), name="index")
370+
logger.debug("all_report_df.shape: {}".format(all_report_df.shape))
371+
logger.debug("all_report_df.index: {}".format(all_report_df.index))
372+
logger.debug("all_report_df.columns: {}".format(all_report_df.columns))
355373

356374
return all_report_df
357375

358376

359-
def assemble_concatenated_meta(concated_meta_dfs):
377+
def assemble_concatenated_meta(concated_meta_dfs, remove_all_metadata_fields):
360378
""" Assemble the concatenated metadata dfs together. For example,
361379
if horizontally concatenating, the concatenated metadata dfs are the
362380
column metadata dfs. Both indices are sorted.
@@ -369,6 +387,10 @@ def assemble_concatenated_meta(concated_meta_dfs):
369387
370388
"""
371389
# Concatenate the concated_meta_dfs
390+
if remove_all_metadata_fields:
391+
for df in concated_meta_dfs:
392+
df.drop(df.columns, axis=1, inplace=True)
393+
372394
all_concated_meta_df = pd.concat(concated_meta_dfs, axis=0)
373395

374396
# Sanity check: the number of rows in all_concated_meta_df should correspond

cmapPy/pandasGEXpress/tests/test_concat_gctoo.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
77
import cmapPy.pandasGEXpress.concat_gctoo as cg
88
import cmapPy.pandasGEXpress.parse_gct as pg
9+
import tempfile
910

1011

1112
logger = logging.getLogger(setup_logger.LOGGER_NAME)
@@ -23,7 +24,7 @@ def test_left_right(self):
2324
expected_gct = pg.parse(expected_gct_path)
2425

2526
# Merge left and right
26-
concated_gct = cg.hstack([left_gct, right_gct], [], False)
27+
concated_gct = cg.hstack([left_gct, right_gct], False, None, [], False)
2728

2829
pd.util.testing.assert_frame_equal(expected_gct.data_df, concated_gct.data_df, check_names=False)
2930
pd.util.testing.assert_frame_equal(expected_gct.row_metadata_df, concated_gct.row_metadata_df, check_names=False)
@@ -39,7 +40,7 @@ def test_top_bottom(self):
3940
expected_gct = pg.parse(expected_gct_path)
4041

4142
# Merge top and bottom
42-
concated_gct = cg.vstack([top_gct, bottom_gct], [], False)
43+
concated_gct = cg.vstack([top_gct, bottom_gct], False, None, [], False)
4344

4445
pd.util.testing.assert_frame_equal(expected_gct.data_df, concated_gct.data_df, check_names=False)
4546
pd.util.testing.assert_frame_equal(expected_gct.row_metadata_df, concated_gct.row_metadata_df, check_names=False)
@@ -70,12 +71,21 @@ def test_assemble_common_meta(self):
7071
logger.debug("meta2:\n{}".format(meta2))
7172
logger.debug("e_meta:\n{}".format(e_meta1))
7273

74+
error_report_file = tempfile.NamedTemporaryFile().name
75+
logger.debug("rhd3 header needs to be removed - error_report_file: {}".format(error_report_file))
7376
with self.assertRaises(cg.MismatchCommonMetadataConcatGctooException) as e:
74-
cg.assemble_common_meta([meta1, meta2], [], ["my_src1", "my_src2"])
77+
cg.assemble_common_meta([meta1, meta2], [], ["my_src1", "my_src2"], False, error_report_file)
7578
self.assertIn("r3", str(e.exception))
7679
logger.debug("rhd3 header needs to be removed - e.exception: {}".format(e.exception))
80+
report_df = pd.read_csv(error_report_file, sep="\t")
81+
self.assertGreater(report_df.shape[0], 0)
82+
self.assertGreater(report_df.shape[1], 0)
83+
self.assertIn("source_file", report_df.columns)
84+
self.assertIn("orig_rid", report_df.columns)
85+
self.assertTrue(set(meta1.columns) < set(report_df.columns))
7786

78-
out_meta1 = cg.assemble_common_meta([meta1, meta2], ["rhd3"], None)
87+
88+
out_meta1 = cg.assemble_common_meta([meta1, meta2], ["rhd3"], None, False, None)
7989
logger.debug("out_meta1:\n{}".format(out_meta1))
8090
pd.util.testing.assert_frame_equal(out_meta1, e_meta1)
8191

@@ -95,7 +105,7 @@ def test_assemble_common_meta(self):
95105

96106
logger.debug("meta3:\n{}".format(meta3))
97107
logger.debug("e_meta2:\n{}".format(e_meta2))
98-
out_meta2 = cg.assemble_common_meta([meta1, meta3], [], None)
108+
out_meta2 = cg.assemble_common_meta([meta1, meta3], [], None, False, None)
99109
pd.util.testing.assert_frame_equal(out_meta2, e_meta2)
100110

101111
# Some ids not present in both dfs
@@ -109,7 +119,7 @@ def test_assemble_common_meta(self):
109119
logger.debug("meta4:\n{}".format(meta4))
110120

111121
with self.assertRaises(cg.MismatchCommonMetadataConcatGctooException) as e:
112-
cg.assemble_common_meta([meta1, meta4], [], ["my_src1", "my_src4"])
122+
cg.assemble_common_meta([meta1, meta4], [], ["my_src1", "my_src4"], False, None)
113123
self.assertIn("r1", str(e.exception))
114124

115125
def test_assemble_concatenated_meta(self):
@@ -132,9 +142,17 @@ def test_assemble_concatenated_meta(self):
132142
logger.debug("meta2:\n{}".format(meta2))
133143
logger.debug("e_concated:\n{}".format(e_concated))
134144

135-
concated = cg.assemble_concatenated_meta([meta2, meta1])
145+
concated = cg.assemble_concatenated_meta([meta2, meta1], False)
146+
logger.debug("happy path - concated:\n{}".format(concated))
136147
pd.util.testing.assert_frame_equal(e_concated, concated)
137148

149+
#remove all metadata
150+
r = cg.assemble_concatenated_meta([meta2, meta1], True)
151+
logger.debug("remove all metadata - r:\n{}".format(r))
152+
self.assertEqual((4,0), r.shape)
153+
self.assertTrue((e_concated.index == r.index).all())
154+
155+
138156
def test_assemble_data(self):
139157
# Horizontal concat
140158
df1 = pd.DataFrame(
@@ -220,13 +238,20 @@ def test_build_common_all_meta_dfs(self):
220238
index=["r1", "r2", "r3"],
221239
columns=["rhd1", "rhd2"])
222240

223-
r_all, r_all_w_dups = cg.build_common_all_meta_dfs([meta1, meta2], ["rhd3"])
241+
r_all, r_all_w_dups = cg.build_common_all_meta_dfs([meta1, meta2], ["rhd3"], False)
224242
logger.debug("rhd3 header needs to be removed - r_all:\n{}".format(r_all))
225243
logger.debug("r_all_w_dups:\n{}".format(r_all_w_dups))
226244
self.assertEqual((3,2), r_all.shape)
227245
self.assertEqual((6,2), r_all_w_dups.shape)
228246
pd.util.testing.assert_frame_equal(e_meta1, r_all)
229247

248+
#remove all metadata fields
249+
r_all, r_all_w_dups = cg.build_common_all_meta_dfs([meta1, meta2], [], True)
250+
logger.debug("remove all metadata fields - r_all\n{}".format(r_all))
251+
logger.debug("r_all_w_dups:\n{}".format(r_all_w_dups))
252+
self.assertEqual((3,0), r_all.shape)
253+
self.assertTrue((e_meta1.index == r_all.index).all())
254+
230255

231256
meta4 = pd.DataFrame(
232257
[["r1_1", "r1_22", "r1_5"],
@@ -246,7 +271,7 @@ def test_build_common_all_meta_dfs(self):
246271

247272
# rhd5 not in meta4, so it should be dropped even without being
248273
# explicitly provided
249-
out_meta3 = cg.assemble_common_meta([meta1, meta4], ["rhd2"], None)
274+
out_meta3, _ = cg.build_common_all_meta_dfs([meta1, meta4], ["rhd2"], False)
250275
logger.debug("""rhd5 not in meta4 so it should be automatically dropped without being
251276
explictly listed in fields_to_remove - out_meta3:
252277
{}""".format(out_meta3))
@@ -255,14 +280,14 @@ def test_build_common_all_meta_dfs(self):
255280
# Empty metadata
256281
empty_meta = pd.DataFrame([], index=["a", "b", "c"])
257282
logger.debug("empty metadata provided - empty_meta.empty: {}".format(empty_meta.empty))
258-
out_meta4 = cg.assemble_common_meta([empty_meta, empty_meta], [], None)
283+
out_meta4, _ = cg.build_common_all_meta_dfs([empty_meta, empty_meta], [], False)
259284
logger.debug("empty metadata provided - out_meta4:\n{}".format(out_meta4))
260285
pd.util.testing.assert_frame_equal(out_meta4, empty_meta)
261286

262287
#metadata has duplicates but index is unique
263288
meta5 = pd.DataFrame({"rhd1":[0,0,1]}, index=range(3))
264289
meta6 = pd.DataFrame({"rhd1":[0,0,1]}, index=range(3))
265-
out_meta5 = cg.assemble_common_meta([meta5, meta6], [], None)
290+
out_meta5, _ = cg.build_common_all_meta_dfs([meta5, meta6], [], False)
266291
logger.debug("metadata has duplicates but index is unique - out_meta5:\n{}".format(out_meta5))
267292
self.assertEqual((3,1), out_meta5.shape, "metadata contains duplicates but index is unique - should have been kept")
268293

@@ -290,19 +315,20 @@ def test_build_mismatched_common_meta_report(self):
290315
logger.debug("meta1:\n{}".format(meta1))
291316
logger.debug("meta2:\n{}".format(meta2))
292317
logger.debug("meta3:\n{}".format(meta3))
293-
# logger.debug("meta4:\n{}".format(meta4))
294318

295319
common_meta_dfs = [meta1, meta2, meta3]
296-
all_meta_df, all_meta_df_with_dups = cg.build_common_all_meta_dfs(common_meta_dfs, [])
320+
all_meta_df, all_meta_df_with_dups = cg.build_common_all_meta_dfs(common_meta_dfs, [], False)
297321
common_meta_df_shapes = [x.shape for x in common_meta_dfs]
298322
sources = ["my_src1", "my_src2", "my_src3"]
299323
self.assertFalse(all_meta_df.index.is_unique, "during setup expected the index to not be unique")
300324

301325
r = cg.build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta_df, all_meta_df_with_dups)
302-
logger.debug("r: {}".format(r))
303-
self.assertEqual((3, 4), r.shape)
326+
logger.debug("r:\n{}".format(r))
327+
self.assertEqual((3, 5), r.shape)
304328
self.assertIn("source_file", r.columns)
305-
self.assertTrue(all(r.index == "r3"))
329+
self.assertIn("orig_rid", r.columns)
330+
self.assertTrue(set(meta1.columns) < set(r.columns))
331+
self.assertEqual({"r3"}, set(r.orig_rid))
306332

307333

308334
if __name__ == "__main__":

0 commit comments

Comments (0)