Skip to content

Commit 4f288b8

Browse files
author
Dave Lahr
committed
pandasGEXpress/concat_gctoo: add options to 1) ignore all metadata fields when concatenating and 2) print a full report when encountering an error caused by mismatched common metadata
1 parent 245f9bd commit 4f288b8

File tree

2 files changed

+80
-32
lines changed

2 files changed

+80
-32
lines changed

cmapPy/pandasGEXpress/concat_gctoo.py

Lines changed: 38 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -73,6 +73,8 @@ def build_parser():
7373
help="what to name the output file")
7474
parser.add_argument("--fields_to_remove", "-ftr", nargs="+", default=[],
7575
help="fields to remove from the common metadata")
76+
parser.add_argument("--remove_all_metadata_fields", "-ramf", action="store_true", default=False,
77+
help="remove all metadata fields during operation")
7678
parser.add_argument("--reset_ids", "-rsi", action="store_true", default=False,
7779
help="whether to reset ids (use this flag if ids are not unique)")
7880

@@ -85,6 +87,10 @@ def build_parser():
8587
parser.add_argument("-verbose", "-v", action="store_true", default=False,
8688
help="whether to print a bunch of output")
8789

90+
parser.add_argument("--error_report_output_file", "-erof", type=str, default="concat_gctoo_errors.txt",
91+
help="""destination file for writing out error report - currently information about inconsistent
92+
metadata fields in the common dimension of the concat operation""")
93+
8894
return parser
8995

9096

@@ -154,7 +160,7 @@ def get_file_list(wildcard):
154160
return files
155161

156162

157-
def hstack(gctoos, fields_to_remove=[], reset_ids=False):
163+
def hstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
158164
""" Horizontally concatenate gctoos.
159165
160166
Args:
@@ -180,10 +186,10 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
180186
logger.debug("shapes of row_meta_dfs: {}".format([x.shape for x in row_meta_dfs]))
181187

182188
# Concatenate row metadata
183-
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs)
189+
all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
184190

185191
# Concatenate col metadata
186-
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs)
192+
all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs, remove_all_metadata_fields)
187193

188194
# Concatenate the data_dfs
189195
all_data_df = assemble_data(data_dfs, "horiz")
@@ -205,7 +211,7 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
205211
return concated
206212

207213

208-
def vstack(gctoos, fields_to_remove=[], reset_ids=False):
214+
def vstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
209215
""" Vertically concatenate gctoos.
210216
211217
Args:
@@ -229,10 +235,10 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
229235
srcs.append(g.src)
230236

231237
# Concatenate col metadata
232-
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs)
238+
all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
233239

234240
# Concatenate row metadata
235-
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs)
241+
all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs, remove_all_metadata_fields)
236242

237243
# Concatenate the data_dfs
238244
all_data_df = assemble_data(data_dfs, "vert")
@@ -254,7 +260,7 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
254260
return concated
255261

256262

257-
def assemble_common_meta(common_meta_dfs, fields_to_remove, sources):
263+
def assemble_common_meta(common_meta_dfs, fields_to_remove, sources, remove_all_metadata_fields, error_report_file):
258264
""" Assemble the common metadata dfs together. Both indices are sorted.
259265
Fields that are not in all the dfs are dropped.
260266
@@ -267,14 +273,17 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove, sources):
267273
all_meta_df_sorted (pandas df)
268274
269275
"""
270-
all_meta_df, all_meta_df_with_dups = build_common_all_meta_dfs(common_meta_dfs, fields_to_remove)
276+
all_meta_df, all_meta_df_with_dups = build_common_all_meta_dfs(common_meta_dfs, fields_to_remove, remove_all_metadata_fields)
271277

272278
if not all_meta_df.index.is_unique:
273279
all_report_df = build_mismatched_common_meta_report([x.shape for x in common_meta_dfs],
274280
sources, all_meta_df, all_meta_df_with_dups)
275281

276282
unique_duplicate_ids = all_report_df.index.unique()
277283

284+
if error_report_file is not None:
285+
all_report_df.to_csv(error_report_file, sep="\t")
286+
278287
msg = """There are inconsistencies in common_metadata_df between different files. Try excluding metadata fields
279288
using the fields_to_remove argument. unique_duplicate_ids: {}
280289
all_report_df:
@@ -287,16 +296,20 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove, sources):
287296
return all_meta_df_sorted
288297

289298

290-
def build_common_all_meta_dfs(common_meta_dfs, fields_to_remove):
299+
def build_common_all_meta_dfs(common_meta_dfs, fields_to_remove, remove_all_metadata_fields):
291300
# Remove any column headers that are not present in all dfs (and sort)
292-
shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
293-
logger.debug("shared_column_headers: {}".format(shared_column_headers))
294301

295-
trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
302+
if remove_all_metadata_fields:
303+
trimmed_common_meta_dfs = [pd.DataFrame(index=df.index) for df in common_meta_dfs]
304+
else:
305+
shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
306+
logger.debug("shared_column_headers: {}".format(shared_column_headers))
296307

297-
# Remove any column headers that will prevent dfs from being identical
298-
for df in trimmed_common_meta_dfs:
299-
df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
308+
trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
309+
310+
# Remove any column headers that will prevent dfs from being identical
311+
for df in trimmed_common_meta_dfs:
312+
df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
300313

301314
# Concatenate all dfs and then remove duplicate rows
302315
all_meta_df_with_dups = pd.concat(trimmed_common_meta_dfs, axis=0)
@@ -352,11 +365,16 @@ def build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta
352365
report_df_list.append(report_df)
353366

354367
all_report_df = pd.concat(report_df_list, axis=0)
368+
all_report_df["orig_rid"] = all_report_df.index
369+
all_report_df.index = pd.Index(xrange(all_report_df.shape[0]), name="index")
370+
logger.debug("all_report_df.shape: {}".format(all_report_df.shape))
371+
logger.debug("all_report_df.index: {}".format(all_report_df.index))
372+
logger.debug("all_report_df.columns: {}".format(all_report_df.columns))
355373

356374
return all_report_df
357375

358376

359-
def assemble_concatenated_meta(concated_meta_dfs):
377+
def assemble_concatenated_meta(concated_meta_dfs, remove_all_metadata_fields):
360378
""" Assemble the concatenated metadata dfs together. For example,
361379
if horizontally concatenating, the concatenated metadata dfs are the
362380
column metadata dfs. Both indices are sorted.
@@ -369,6 +387,10 @@ def assemble_concatenated_meta(concated_meta_dfs):
369387
370388
"""
371389
# Concatenate the concated_meta_dfs
390+
if remove_all_metadata_fields:
391+
for df in concated_meta_dfs:
392+
df.drop(df.columns, axis=1, inplace=True)
393+
372394
all_concated_meta_df = pd.concat(concated_meta_dfs, axis=0)
373395

374396
# Sanity check: the number of rows in all_concated_meta_df should correspond

cmapPy/pandasGEXpress/tests/test_concat_gctoo.py

Lines changed: 42 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
77
import cmapPy.pandasGEXpress.concat_gctoo as cg
88
import cmapPy.pandasGEXpress.parse_gct as pg
9+
import tempfile
910

1011

1112
logger = logging.getLogger(setup_logger.LOGGER_NAME)
@@ -23,7 +24,7 @@ def test_left_right(self):
2324
expected_gct = pg.parse(expected_gct_path)
2425

2526
# Merge left and right
26-
concated_gct = cg.hstack([left_gct, right_gct], [], False)
27+
concated_gct = cg.hstack([left_gct, right_gct], False, None, [], False)
2728

2829
pd.util.testing.assert_frame_equal(expected_gct.data_df, concated_gct.data_df, check_names=False)
2930
pd.util.testing.assert_frame_equal(expected_gct.row_metadata_df, concated_gct.row_metadata_df, check_names=False)
@@ -39,7 +40,7 @@ def test_top_bottom(self):
3940
expected_gct = pg.parse(expected_gct_path)
4041

4142
# Merge top and bottom
42-
concated_gct = cg.vstack([top_gct, bottom_gct], [], False)
43+
concated_gct = cg.vstack([top_gct, bottom_gct], False, None, [], False)
4344

4445
pd.util.testing.assert_frame_equal(expected_gct.data_df, concated_gct.data_df, check_names=False)
4546
pd.util.testing.assert_frame_equal(expected_gct.row_metadata_df, concated_gct.row_metadata_df, check_names=False)
@@ -70,12 +71,21 @@ def test_assemble_common_meta(self):
7071
logger.debug("meta2:\n{}".format(meta2))
7172
logger.debug("e_meta:\n{}".format(e_meta1))
7273

74+
error_report_file = tempfile.NamedTemporaryFile().name
75+
logger.debug("rhd3 header needs to be removed - error_report_file: {}".format(error_report_file))
7376
with self.assertRaises(cg.MismatchCommonMetadataConcatGctooException) as e:
74-
cg.assemble_common_meta([meta1, meta2], [], ["my_src1", "my_src2"])
77+
cg.assemble_common_meta([meta1, meta2], [], ["my_src1", "my_src2"], False, error_report_file)
7578
self.assertIn("r3", str(e.exception))
7679
logger.debug("rhd3 header needs to be removed - e.exception: {}".format(e.exception))
80+
report_df = pd.read_csv(error_report_file, sep="\t")
81+
self.assertGreater(report_df.shape[0], 0)
82+
self.assertGreater(report_df.shape[1], 0)
83+
self.assertIn("source_file", report_df.columns)
84+
self.assertIn("orig_rid", report_df.columns)
85+
self.assertTrue(set(meta1.columns) < set(report_df.columns))
7786

78-
out_meta1 = cg.assemble_common_meta([meta1, meta2], ["rhd3"], None)
87+
88+
out_meta1 = cg.assemble_common_meta([meta1, meta2], ["rhd3"], None, False, None)
7989
logger.debug("out_meta1:\n{}".format(out_meta1))
8090
pd.util.testing.assert_frame_equal(out_meta1, e_meta1)
8191

@@ -95,7 +105,7 @@ def test_assemble_common_meta(self):
95105

96106
logger.debug("meta3:\n{}".format(meta3))
97107
logger.debug("e_meta2:\n{}".format(e_meta2))
98-
out_meta2 = cg.assemble_common_meta([meta1, meta3], [], None)
108+
out_meta2 = cg.assemble_common_meta([meta1, meta3], [], None, False, None)
99109
pd.util.testing.assert_frame_equal(out_meta2, e_meta2)
100110

101111
# Some ids not present in both dfs
@@ -109,7 +119,7 @@ def test_assemble_common_meta(self):
109119
logger.debug("meta4:\n{}".format(meta4))
110120

111121
with self.assertRaises(cg.MismatchCommonMetadataConcatGctooException) as e:
112-
cg.assemble_common_meta([meta1, meta4], [], ["my_src1", "my_src4"])
122+
cg.assemble_common_meta([meta1, meta4], [], ["my_src1", "my_src4"], False, None)
113123
self.assertIn("r1", str(e.exception))
114124

115125
def test_assemble_concatenated_meta(self):
@@ -132,9 +142,17 @@ def test_assemble_concatenated_meta(self):
132142
logger.debug("meta2:\n{}".format(meta2))
133143
logger.debug("e_concated:\n{}".format(e_concated))
134144

135-
concated = cg.assemble_concatenated_meta([meta2, meta1])
145+
concated = cg.assemble_concatenated_meta([meta2, meta1], False)
146+
logger.debug("happy path - concated:\n{}".format(concated))
136147
pd.util.testing.assert_frame_equal(e_concated, concated)
137148

149+
#remove all metadata
150+
r = cg.assemble_concatenated_meta([meta2, meta1], True)
151+
logger.debug("remove all metadata - r:\n{}".format(r))
152+
self.assertEqual((4,0), r.shape)
153+
self.assertTrue((e_concated.index == r.index).all())
154+
155+
138156
def test_assemble_data(self):
139157
# Horizontal concat
140158
df1 = pd.DataFrame(
@@ -220,13 +238,20 @@ def test_build_common_all_meta_dfs(self):
220238
index=["r1", "r2", "r3"],
221239
columns=["rhd1", "rhd2"])
222240

223-
r_all, r_all_w_dups = cg.build_common_all_meta_dfs([meta1, meta2], ["rhd3"])
241+
r_all, r_all_w_dups = cg.build_common_all_meta_dfs([meta1, meta2], ["rhd3"], False)
224242
logger.debug("rhd3 header needs to be removed - r_all:\n{}".format(r_all))
225243
logger.debug("r_all_w_dups:\n{}".format(r_all_w_dups))
226244
self.assertEqual((3,2), r_all.shape)
227245
self.assertEqual((6,2), r_all_w_dups.shape)
228246
pd.util.testing.assert_frame_equal(e_meta1, r_all)
229247

248+
#remove all metadata fields
249+
r_all, r_all_w_dups = cg.build_common_all_meta_dfs([meta1, meta2], [], True)
250+
logger.debug("remove all metadata fields - r_all\n{}".format(r_all))
251+
logger.debug("r_all_w_dups:\n{}".format(r_all_w_dups))
252+
self.assertEqual((3,0), r_all.shape)
253+
self.assertTrue((e_meta1.index == r_all.index).all())
254+
230255

231256
meta4 = pd.DataFrame(
232257
[["r1_1", "r1_22", "r1_5"],
@@ -246,7 +271,7 @@ def test_build_common_all_meta_dfs(self):
246271

247272
# rhd5 not in meta4, so it should be dropped even without being
248273
# explicitly provided
249-
out_meta3 = cg.assemble_common_meta([meta1, meta4], ["rhd2"], None)
274+
out_meta3, _ = cg.build_common_all_meta_dfs([meta1, meta4], ["rhd2"], False)
250275
logger.debug("""rhd5 not in meta4 so it should be automatically dropped without being
251276
explictly listed in fields_to_remove - out_meta3:
252277
{}""".format(out_meta3))
@@ -255,14 +280,14 @@ def test_build_common_all_meta_dfs(self):
255280
# Empty metadata
256281
empty_meta = pd.DataFrame([], index=["a", "b", "c"])
257282
logger.debug("empty metadata provided - empty_meta.empty: {}".format(empty_meta.empty))
258-
out_meta4 = cg.assemble_common_meta([empty_meta, empty_meta], [], None)
283+
out_meta4, _ = cg.build_common_all_meta_dfs([empty_meta, empty_meta], [], False)
259284
logger.debug("empty metadata provided - out_meta4:\n{}".format(out_meta4))
260285
pd.util.testing.assert_frame_equal(out_meta4, empty_meta)
261286

262287
#metadata has duplicates but index is unique
263288
meta5 = pd.DataFrame({"rhd1":[0,0,1]}, index=range(3))
264289
meta6 = pd.DataFrame({"rhd1":[0,0,1]}, index=range(3))
265-
out_meta5 = cg.assemble_common_meta([meta5, meta6], [], None)
290+
out_meta5, _ = cg.build_common_all_meta_dfs([meta5, meta6], [], False)
266291
logger.debug("metadata has duplicates but index is unique - out_meta5:\n{}".format(out_meta5))
267292
self.assertEqual((3,1), out_meta5.shape, "metadata contains duplicates but index is unique - should have been kept")
268293

@@ -290,19 +315,20 @@ def test_build_mismatched_common_meta_report(self):
290315
logger.debug("meta1:\n{}".format(meta1))
291316
logger.debug("meta2:\n{}".format(meta2))
292317
logger.debug("meta3:\n{}".format(meta3))
293-
# logger.debug("meta4:\n{}".format(meta4))
294318

295319
common_meta_dfs = [meta1, meta2, meta3]
296-
all_meta_df, all_meta_df_with_dups = cg.build_common_all_meta_dfs(common_meta_dfs, [])
320+
all_meta_df, all_meta_df_with_dups = cg.build_common_all_meta_dfs(common_meta_dfs, [], False)
297321
common_meta_df_shapes = [x.shape for x in common_meta_dfs]
298322
sources = ["my_src1", "my_src2", "my_src3"]
299323
self.assertFalse(all_meta_df.index.is_unique, "during setup expected the index to not be unique")
300324

301325
r = cg.build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta_df, all_meta_df_with_dups)
302-
logger.debug("r: {}".format(r))
303-
self.assertEqual((3, 4), r.shape)
326+
logger.debug("r:\n{}".format(r))
327+
self.assertEqual((3, 5), r.shape)
304328
self.assertIn("source_file", r.columns)
305-
self.assertTrue(all(r.index == "r3"))
329+
self.assertIn("orig_rid", r.columns)
330+
self.assertTrue(set(meta1.columns) < set(r.columns))
331+
self.assertEqual({"r3"}, set(r.orig_rid))
306332

307333

308334
if __name__ == "__main__":

0 commit comments

Comments (0)