
Commit ffa8407

Author: Dave Lahr (committed)
pandasGEXpress/concat_gctoo: add doc strings to methods, slight rename
1 parent eb41ae0 · commit ffa8407

File tree: 2 files changed (+40, -10 lines)


cmapPy/pandasGEXpress/concat_gctoo.py

Lines changed: 33 additions & 3 deletions
@@ -276,7 +276,7 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove, sources, remove_all_
         all_meta_df_sorted (pandas df)
 
     """
-    all_meta_df, all_meta_df_with_dups = build_common_all_meta_dfs(common_meta_dfs, fields_to_remove, remove_all_metadata_fields)
+    all_meta_df, all_meta_df_with_dups = build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields)
 
     if not all_meta_df.index.is_unique:
         all_report_df = build_mismatched_common_meta_report([x.shape for x in common_meta_dfs],
@@ -299,8 +299,25 @@ def assemble_common_meta(common_meta_dfs, fields_to_remove, sources, remove_all_
     return all_meta_df_sorted
 
 
-def build_common_all_meta_dfs(common_meta_dfs, fields_to_remove, remove_all_metadata_fields):
-    # Remove any column headers that are not present in all dfs (and sort)
+def build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields):
+    """
+    concatenate the entries in common_meta_dfs, removing columns selectively (fields_to_remove) or entirely (
+    remove_all_metadata_fields=True; in this case, effectively just merges all the indexes in common_meta_dfs).
+
+    Returns 2 dataframes (in a tuple): the first has duplicates removed, the second does not.
+
+    Args:
+        common_meta_dfs: collection of pandas DataFrames containing the metadata in the "common" direction of the
+            concatenation operation
+        fields_to_remove: columns to be removed (if present) from the common_meta_dfs
+        remove_all_metadata_fields: boolean indicating that all metadata fields should be removed from the
+            common_meta_dfs; overrides fields_to_remove if present
+
+    Returns:
+        tuple containing
+            all_meta_df: pandas dataframe that is the concatenation of the dataframes in common_meta_dfs,
+            all_meta_df_with_dups:
+    """
 
     if remove_all_metadata_fields:
         trimmed_common_meta_dfs = [pd.DataFrame(index=df.index) for df in common_meta_dfs]
@@ -338,6 +355,19 @@ def build_common_all_meta_dfs(common_meta_dfs, fields_to_remove, remove_all_meta
 
 
 def build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta_df, all_meta_df_with_dups):
+    """
+    Generate a report (dataframe) that indicates for the common metadata that does not match across the common metadata
+    which source file had which of the different mismatch values
+
+    Args:
+        common_meta_df_shapes: list of tuples that are the shapes of the common meta dataframes
+        sources: list of the source files that the dataframes were loaded from
+        all_meta_df: produced from build_common_all_meta_df
+        all_meta_df_with_dups: produced from build_common_all_meta_df
+
+    Returns:
+        all_report_df: dataframe indicating the mismatched row metadata values and the corresponding source file
+    """
     expanded_sources = []
     for (i, shape) in enumerate(common_meta_df_shapes):
         src = sources[i]

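For orientation, here is a minimal usage sketch of the renamed helper, based on the docstring added above and the calls in test_concat_gctoo.py. The import path follows the file path shown above, and the example DataFrames are hypothetical; this is an illustration, not part of the commit.

import pandas as pd
import cmapPy.pandasGEXpress.concat_gctoo as cg

# Hypothetical "common" (row) metadata from two files being concatenated.
meta_a = pd.DataFrame([["r1_1", "r1_2"], ["r2_1", "r2_2"]],
                      index=["r1", "r2"], columns=["rhd1", "rhd2"])
meta_b = pd.DataFrame([["r1_1", "r1_2"], ["r3_1", "r3_2"]],
                      index=["r1", "r3"], columns=["rhd1", "rhd2"])

# Selective removal: drop "rhd2" if present; returns the deduplicated frame
# and the frame that still contains duplicate rows.
all_meta_df, all_meta_df_with_dups = cg.build_common_all_meta_df(
    [meta_a, meta_b], ["rhd2"], False)

# Full removal: remove_all_metadata_fields=True keeps only the merged index.
index_only_df, _ = cg.build_common_all_meta_df([meta_a, meta_b], [], True)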
cmapPy/pandasGEXpress/tests/test_concat_gctoo.py

Lines changed: 7 additions & 7 deletions
@@ -219,7 +219,7 @@ def test_do_reset_ids(self):
         pd.util.testing.assert_frame_equal(meta_df, e_meta_df)
         pd.util.testing.assert_frame_equal(data_df, e_data_df)
 
-    def test_build_common_all_meta_dfs(self):
+    def test_build_common_all_meta_df(self):
         # rhd3 header needs to be removed
         meta1 = pd.DataFrame(
             [["r1_1", "r1_2", "r1_3"],
@@ -240,15 +240,15 @@ def test_build_common_all_meta_dfs(self):
             index=["r1", "r2", "r3"],
             columns=["rhd1", "rhd2"])
 
-        r_all, r_all_w_dups = cg.build_common_all_meta_dfs([meta1, meta2], ["rhd3"], False)
+        r_all, r_all_w_dups = cg.build_common_all_meta_df([meta1, meta2], ["rhd3"], False)
         logger.debug("rhd3 header needs to be removed - r_all:\n{}".format(r_all))
         logger.debug("r_all_w_dups:\n{}".format(r_all_w_dups))
         self.assertEqual((3,2), r_all.shape)
         self.assertEqual((6,2), r_all_w_dups.shape)
         pd.util.testing.assert_frame_equal(e_meta1, r_all)
 
         #remove all metadata fields
-        r_all, r_all_w_dups = cg.build_common_all_meta_dfs([meta1, meta2], [], True)
+        r_all, r_all_w_dups = cg.build_common_all_meta_df([meta1, meta2], [], True)
         logger.debug("remove all metadata fields - r_all\n{}".format(r_all))
         logger.debug("r_all_w_dups:\n{}".format(r_all_w_dups))
         self.assertEqual((3,0), r_all.shape)
@@ -273,7 +273,7 @@ def test_build_common_all_meta_dfs(self):
 
         # rhd5 not in meta4, so it should be dropped even without being
         # explicitly provided
-        out_meta3, _ = cg.build_common_all_meta_dfs([meta1, meta4], ["rhd2"], False)
+        out_meta3, _ = cg.build_common_all_meta_df([meta1, meta4], ["rhd2"], False)
         logger.debug("""rhd5 not in meta4 so it should be automatically dropped without being
         explictly listed in fields_to_remove - out_meta3:
         {}""".format(out_meta3))
@@ -282,14 +282,14 @@ def test_build_common_all_meta_dfs(self):
         # Empty metadata
         empty_meta = pd.DataFrame([], index=["a", "b", "c"])
         logger.debug("empty metadata provided - empty_meta.empty: {}".format(empty_meta.empty))
-        out_meta4, _ = cg.build_common_all_meta_dfs([empty_meta, empty_meta], [], False)
+        out_meta4, _ = cg.build_common_all_meta_df([empty_meta, empty_meta], [], False)
         logger.debug("empty metadata provided - out_meta4:\n{}".format(out_meta4))
         pd.util.testing.assert_frame_equal(out_meta4, empty_meta)
 
         #metadata has duplicates but index is unique
         meta5 = pd.DataFrame({"rhd1":[0,0,1]}, index=range(3))
         meta6 = pd.DataFrame({"rhd1":[0,0,1]}, index=range(3))
-        out_meta5, _ = cg.build_common_all_meta_dfs([meta5, meta6], [], False)
+        out_meta5, _ = cg.build_common_all_meta_df([meta5, meta6], [], False)
         logger.debug("metadata has duplicates but index is unique - out_meta5:\n{}".format(out_meta5))
         self.assertEqual((3,1), out_meta5.shape, "metadata contains duplicates but index is unique - should have been kept")
 
@@ -319,7 +319,7 @@ def test_build_mismatched_common_meta_report(self):
         logger.debug("meta3:\n{}".format(meta3))
 
         common_meta_dfs = [meta1, meta2, meta3]
-        all_meta_df, all_meta_df_with_dups = cg.build_common_all_meta_dfs(common_meta_dfs, [], False)
+        all_meta_df, all_meta_df_with_dups = cg.build_common_all_meta_df(common_meta_dfs, [], False)
         common_meta_df_shapes = [x.shape for x in common_meta_dfs]
         sources = ["my_src1", "my_src2", "my_src3"]
         self.assertFalse(all_meta_df.index.is_unique, "during setup expected the index to not be unique")

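As the updated test above suggests, the mismatch report is only built when the concatenated common metadata ends up with a non-unique index; assemble_common_meta (first hunk above) guards the call the same way. A rough sketch of that pattern, reusing the hypothetical cg import and metadata frames from the earlier sketch:

# common_meta_dfs and sources would normally come from the files being
# concatenated; here they are hypothetical placeholders.
common_meta_dfs = [meta_a, meta_b]
sources = ["my_src1", "my_src2"]

all_meta_df, all_meta_df_with_dups = cg.build_common_all_meta_df(common_meta_dfs, [], False)

# Only produce the mismatch report if the concatenated metadata index is not unique.
if not all_meta_df.index.is_unique:
    all_report_df = cg.build_mismatched_common_meta_report(
        [x.shape for x in common_meta_dfs],
        sources,
        all_meta_df,
        all_meta_df_with_dups)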