Commit 0bc6203

Merge pull request #12 from cmap/upgrade_pandas_etc
Upgrade pandas etc
2 parents: df84861 + ffa8407

172 files changed: +1779 / -41537 lines

Note: large commits have some content hidden by default, so only a subset of the 172 changed files appears below.


cmapPy/pandasGEXpress/GCToo.py

Lines changed: 3 additions & 1 deletion
@@ -42,11 +42,13 @@
 import numpy as np
 import pandas as pd
 import logging
-from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
+import setup_GCToo_logger as setup_logger
+
 
 __authors__ = 'Oana Enache, Lev Litichevskiy, Dave Lahr'
 __email__ = '[email protected]'
 
+
 class GCToo(object):
     """Class representing parsed gct(x) objects as pandas dataframes.
     Contains 3 component dataframes (row_metadata_df, column_metadata_df,
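For orientation, here is a minimal sketch (not part of this commit) of constructing a GCToo from three aligned dataframes, assuming the constructor accepts data_df, row_metadata_df, and col_metadata_df keyword arguments as the class docstring above suggests:

    import pandas as pd
    from cmapPy.pandasGEXpress.GCToo import GCToo

    # data_df holds the values; its index (rids) and columns (cids) must match
    # the indices of the two metadata dataframes
    data_df = pd.DataFrame([[1.0, 2.0]], index=["rid1"], columns=["cid1", "cid2"])
    row_metadata_df = pd.DataFrame({"rhd1": ["a"]}, index=["rid1"])
    col_metadata_df = pd.DataFrame({"chd1": ["b", "c"]}, index=["cid1", "cid2"])

    my_gctoo = GCToo(data_df=data_df, row_metadata_df=row_metadata_df,
                     col_metadata_df=col_metadata_df)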

cmapPy/pandasGEXpress/__init__.py

Lines changed: 1 addition & 4 deletions
@@ -1,4 +1 @@
-from .parse import parse
-#from .GCToo import GCToo
-#from .write_gctx import write_gctx
-#from .write_gct import write_gct
+from cmapPy.pandasGEXpress.parse import parse
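With the absolute import above, downstream code can keep using the subpackage-level name; a sketch (the input filename is hypothetical):

    from cmapPy.pandasGEXpress import parse

    my_gctoo = parse("my_data.gctx")  # hypothetical file; returns a GCToo object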

cmapPy/pandasGEXpress/concat_gctoo.py

Lines changed: 138 additions & 29 deletions
@@ -38,6 +38,7 @@
 import logging
 import setup_GCToo_logger as setup_logger
 import pandas as pd
+import numpy
 
 import GCToo
 import parse
@@ -72,6 +73,8 @@ def build_parser():
                         help="what to name the output file")
     parser.add_argument("--fields_to_remove", "-ftr", nargs="+", default=[],
                         help="fields to remove from the common metadata")
+    parser.add_argument("--remove_all_metadata_fields", "-ramf", action="store_true", default=False,
+                        help="remove all metadata fields during operation")
     parser.add_argument("--reset_ids", "-rsi", action="store_true", default=False,
                         help="whether to reset ids (use this flag if ids are not unique)")
 
@@ -84,13 +87,18 @@ def build_parser():
     parser.add_argument("-verbose", "-v", action="store_true", default=False,
                         help="whether to print a bunch of output")
 
+    parser.add_argument("--error_report_output_file", "-erof", type=str, default=None,
+                        help="""destination file for writing out error report - currently information about inconsistent
+                        metadata fields in the common dimension of the concat operation""")
+
     return parser
 
 
 def main():
     # get args
     args = build_parser().parse_args(sys.argv[1:])
     setup_logger.setup(verbose=args.verbose)
+    logger.debug("args: {}".format(args))
 
     # Get files directly
     if args.input_filepaths is not None:
@@ -120,10 +128,12 @@ def main():
 
     # Create concatenated gctoo object
     if args.concat_direction == "horiz":
-        out_gctoo = hstack(gctoos, args.fields_to_remove, args.reset_ids)
+        out_gctoo = hstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
+                           args.fields_to_remove, args.reset_ids)
 
     elif args.concat_direction == "vert":
-        out_gctoo = vstack(gctoos, args.fields_to_remove, args.reset_ids)
+        out_gctoo = vstack(gctoos, args.remove_all_metadata_fields, args.error_report_output_file,
+                           args.fields_to_remove, args.reset_ids)
 
     # Write out_gctoo to file
     logger.info("Writing to output file args.out_name: {}".format(args.out_name))
@@ -153,7 +163,7 @@ def get_file_list(wildcard):
     return files
 
 
-def hstack(gctoos, fields_to_remove=[], reset_ids=False):
+def hstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
     """ Horizontally concatenate gctoos.
 
     Args:
@@ -169,18 +179,20 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
     row_meta_dfs = []
     col_meta_dfs = []
     data_dfs = []
+    srcs = []
     for g in gctoos:
         row_meta_dfs.append(g.row_metadata_df)
         col_meta_dfs.append(g.col_metadata_df)
         data_dfs.append(g.data_df)
+        srcs.append(g.src)
 
     logger.debug("shapes of row_meta_dfs: {}".format([x.shape for x in row_meta_dfs]))
 
     # Concatenate row metadata
-    all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove)
+    all_row_metadata_df = assemble_common_meta(row_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
 
     # Concatenate col metadata
-    all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs)
+    all_col_metadata_df = assemble_concatenated_meta(col_meta_dfs, remove_all_metadata_fields)
 
     # Concatenate the data_dfs
     all_data_df = assemble_data(data_dfs, "horiz")
@@ -202,7 +214,7 @@ def hstack(gctoos, fields_to_remove=[], reset_ids=False):
     return concated
 
 
-def vstack(gctoos, fields_to_remove=[], reset_ids=False):
+def vstack(gctoos, remove_all_metadata_fields, error_report_file, fields_to_remove=[], reset_ids=False):
     """ Vertically concatenate gctoos.
 
     Args:
@@ -218,16 +230,18 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
     row_meta_dfs = []
     col_meta_dfs = []
     data_dfs = []
+    srcs = []
     for g in gctoos:
         row_meta_dfs.append(g.row_metadata_df)
         col_meta_dfs.append(g.col_metadata_df)
         data_dfs.append(g.data_df)
+        srcs.append(g.src)
 
     # Concatenate col metadata
-    all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove)
+    all_col_metadata_df = assemble_common_meta(col_meta_dfs, fields_to_remove, srcs, remove_all_metadata_fields, error_report_file)
 
     # Concatenate col metadata
-    all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs)
+    all_row_metadata_df = assemble_concatenated_meta(row_meta_dfs, remove_all_metadata_fields)
 
     # Concatenate the data_dfs
     all_data_df = assemble_data(data_dfs, "vert")
@@ -249,7 +263,7 @@ def vstack(gctoos, fields_to_remove=[], reset_ids=False):
     return concated
 
 
-def assemble_common_meta(common_meta_dfs, fields_to_remove):
+def assemble_common_meta(common_meta_dfs, fields_to_remove, sources, remove_all_metadata_fields, error_report_file):
     """ Assemble the common metadata dfs together. Both indices are sorted.
     Fields that are not in all the dfs are dropped.
 
@@ -262,50 +276,138 @@
         all_meta_df_sorted (pandas df)
 
     """
-    # Remove any column headers that are not present in all dfs (and sort)
-    shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
-    trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
+    all_meta_df, all_meta_df_with_dups = build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields)
+
+    if not all_meta_df.index.is_unique:
+        all_report_df = build_mismatched_common_meta_report([x.shape for x in common_meta_dfs],
+                                                            sources, all_meta_df, all_meta_df_with_dups)
+
+        unique_duplicate_ids = all_report_df.index.unique()
+
+        if error_report_file is not None:
+            all_report_df.to_csv(error_report_file, sep="\t")
+
+        msg = """There are inconsistencies in common_metadata_df between different files. Try excluding metadata fields
+using the fields_to_remove argument. unique_duplicate_ids: {}
+all_report_df:
+{}""".format(unique_duplicate_ids, all_report_df)
+        raise MismatchCommonMetadataConcatGctooException(msg)
+
+    # Finally, sort the index
+    all_meta_df_sorted = all_meta_df.sort_index(axis=0)
+
+    return all_meta_df_sorted
+
+
+def build_common_all_meta_df(common_meta_dfs, fields_to_remove, remove_all_metadata_fields):
+    """
+    concatenate the entries in common_meta_dfs, removing columns selectively (fields_to_remove) or entirely (
+    remove_all_metadata_fields=True; in this case, effectively just merges all the indexes in common_meta_dfs).
+
+    Returns 2 dataframes (in a tuple): the first has duplicates removed, the second does not.
+
+    Args:
+        common_meta_dfs: collection of pandas DataFrames containing the metadata in the "common" direction of the
+            concatenation operation
+        fields_to_remove: columns to be removed (if present) from the common_meta_dfs
+        remove_all_metadata_fields: boolean indicating that all metadata fields should be removed from the
+            common_meta_dfs; overrides fields_to_remove if present
+
+    Returns:
+        tuple containing
+            all_meta_df: pandas dataframe that is the concatenation of the dataframes in common_meta_dfs,
+            all_meta_df_with_dups:
+    """
+
+    if remove_all_metadata_fields:
+        trimmed_common_meta_dfs = [pd.DataFrame(index=df.index) for df in common_meta_dfs]
+    else:
+        shared_column_headers = sorted(set.intersection(*[set(df.columns) for df in common_meta_dfs]))
+        logger.debug("shared_column_headers: {}".format(shared_column_headers))
 
-    # Remove any column headers that will prevent dfs from being identical
-    for df in trimmed_common_meta_dfs:
-        df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
+        trimmed_common_meta_dfs = [df[shared_column_headers] for df in common_meta_dfs]
+
+        # Remove any column headers that will prevent dfs from being identical
+        for df in trimmed_common_meta_dfs:
+            df.drop(fields_to_remove, axis=1, errors="ignore", inplace=True)
 
     # Concatenate all dfs and then remove duplicate rows
     all_meta_df_with_dups = pd.concat(trimmed_common_meta_dfs, axis=0)
+    logger.debug("all_meta_df_with_dups.shape: {}".format(all_meta_df_with_dups.shape))
+    logger.debug("all_meta_df_with_dups.columns: {}".format(all_meta_df_with_dups.columns))
+    logger.debug("all_meta_df_with_dups.index: {}".format(all_meta_df_with_dups.index))
 
     # If all metadata dfs were empty, df will be empty
     if all_meta_df_with_dups.empty:
-
         # Simply return unique ids
         all_meta_df = pd.DataFrame(index=all_meta_df_with_dups.index.unique())
 
     else:
         all_meta_df_with_dups["concat_gctoo_column_for_index"] = all_meta_df_with_dups.index
         all_meta_df = all_meta_df_with_dups.copy(deep=True).drop_duplicates()
         all_meta_df.drop("concat_gctoo_column_for_index", axis=1, inplace=True)
+        all_meta_df_with_dups.drop("concat_gctoo_column_for_index", axis=1, inplace=True)
 
     logger.debug("all_meta_df_with_dups.shape: {}".format(all_meta_df_with_dups.shape))
     logger.debug("all_meta_df.shape: {}".format(all_meta_df.shape))
 
-    # If there are still duplicate ids, then their metadata didn't align
-    # in different gcts
+    return (all_meta_df, all_meta_df_with_dups)
+
+
+def build_mismatched_common_meta_report(common_meta_df_shapes, sources, all_meta_df, all_meta_df_with_dups):
+    """
+    Generate a report (dataframe) that indicates for the common metadata that does not match across the common metadata
+    which source file had which of the different mismatch values
+
+    Args:
+        common_meta_df_shapes: list of tuples that are the shapes of the common meta dataframes
+        sources: list of the source files that the dataframes were loaded from
+        all_meta_df: produced from build_common_all_meta_df
+        all_meta_df_with_dups: produced from build_common_all_meta_df
+
+    Returns:
+        all_report_df: dataframe indicating the mismatched row metadata values and the corresponding source file
+    """
+    expanded_sources = []
+    for (i, shape) in enumerate(common_meta_df_shapes):
+        src = sources[i]
+        expanded_sources.extend([src for i in xrange(shape[0])])
+    expanded_sources = numpy.array(expanded_sources)
+    logger.debug("len(expanded_sources): {}".format(len(expanded_sources)))
+
     duplicate_ids = all_meta_df.index[all_meta_df.index.duplicated(keep=False)]
 
-    assert all_meta_df.index.is_unique, (
-        ("There are inconsistencies in common_metadata_df between " +
-         "different files.\nTry excluding metadata fields " +
-         "using the fields_to_remove argument.\n"
-         "duplicate_ids[0]: {id}\n" +
-         "all_meta_df.loc[{id}, :]:\n{df}").format(id=duplicate_ids[0],
-                                                   df=all_meta_df.loc[duplicate_ids[0], :]))
+    unique_duplicate_ids = duplicate_ids.unique()
+    logger.debug("unique_duplicate_ids: {}".format(unique_duplicate_ids))
 
-    # Finally, sort the index
-    all_meta_df_sorted = all_meta_df.sort_index(axis=0)
+    duplicate_ids_meta_df = all_meta_df.loc[unique_duplicate_ids]
 
-    return all_meta_df_sorted
+    report_df_list = []
+    for unique_dup_id in unique_duplicate_ids:
+        rows = duplicate_ids_meta_df.loc[unique_dup_id]
+
+        matching_row_locs = numpy.array([False for i in xrange(all_meta_df_with_dups.shape[0])])
+        for i in xrange(rows.shape[0]):
+            r = rows.iloc[i]
+            row_comparison = r == all_meta_df_with_dups
+            matching_row_locs = matching_row_locs | row_comparison.all(axis=1).values
 
+        report_df = all_meta_df_with_dups.loc[matching_row_locs].copy()
+        report_df["source_file"] = expanded_sources[matching_row_locs]
+        logger.debug("report_df.shape: {}".format(report_df.shape))
+        report_df_list.append(report_df)
 
-def assemble_concatenated_meta(concated_meta_dfs):
+    all_report_df = pd.concat(report_df_list, axis=0)
+    all_report_df["orig_rid"] = all_report_df.index
+    all_report_df.index = pd.Index(xrange(all_report_df.shape[0]), name="index")
+    logger.debug("all_report_df.shape: {}".format(all_report_df.shape))
+    logger.debug("all_report_df.index: {}".format(all_report_df.index))
+    logger.debug("all_report_df.columns: {}".format(all_report_df.columns))
+
+    return all_report_df
+
+
+def assemble_concatenated_meta(concated_meta_dfs, remove_all_metadata_fields):
     """ Assemble the concatenated metadata dfs together. For example,
     if horizontally concatenating, the concatenated metadata dfs are the
     column metadata dfs. Both indices are sorted.
@@ -318,6 +420,10 @@ def assemble_concatenated_meta(concated_meta_dfs):
 
     """
     # Concatenate the concated_meta_dfs
+    if remove_all_metadata_fields:
+        for df in concated_meta_dfs:
+            df.drop(df.columns, axis=1, inplace=True)
+
     all_concated_meta_df = pd.concat(concated_meta_dfs, axis=0)
 
     # Sanity check: the number of rows in all_concated_meta_df should correspond
@@ -430,5 +536,8 @@ def reset_ids_in_meta_df(meta_df):
     meta_df.index.name = original_index_name
 
 
+class MismatchCommonMetadataConcatGctooException(Exception):
+    pass
+
 if __name__ == "__main__":
     main()
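Taken together, these hunks thread two new options through the concat pipeline: remove_all_metadata_fields discards every metadata column in the common dimension before the consistency check, and error_report_file captures the mismatch report as a TSV instead of only embedding it in the new MismatchCommonMetadataConcatGctooException. A sketch of calling the new hstack signature directly (the input files and the removed field name are hypothetical):

    # Sketch, not from this commit: horizontal concat with the new arguments.
    import parse
    import concat_gctoo

    gctoos = [parse.parse("a.gct"), parse.parse("b.gct")]  # hypothetical inputs
    combined = concat_gctoo.hstack(gctoos,
                                   remove_all_metadata_fields=False,
                                   error_report_file="mismatch_report.tsv",
                                   fields_to_remove=["provenance_code"],  # hypothetical field
                                   reset_ids=False)

The same pair of options is exposed on the command line as --remove_all_metadata_fields / -ramf and --error_report_output_file / -erof.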

cmapPy/pandasGEXpress/gct2gctx.py

Lines changed: 23 additions & 23 deletions
@@ -9,10 +9,9 @@
 """
 
 import logging
-from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
+import setup_GCToo_logger as setup_logger
 import argparse
 import sys
-import GCToo
 import parse_gct
 import write_gctx
 
@@ -23,31 +22,32 @@
 
 
 def build_parser():
-	parser = argparse.ArgumentParser(description=__doc__,
-		formatter_class=argparse.ArgumentDefaultsHelpFormatter)
-	# required
-	parser.add_argument("-filename",
-		help=".gct file that you would like converted to .gctx form")
-	# optional
-	parser.add_argument("-output_filepath",
-		help="(optional) out path/name for output gctx file", default=None)
-	parser.add_argument("-verbose", "-v",
-		help="Whether to print a bunch of output.", action="store_true", default=False)
-	return parser
+    parser = argparse.ArgumentParser(description=__doc__,
+                                     formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    # required
+    parser.add_argument("-filename",
+                        help=".gct file that you would like converted to .gctx form")
+    # optional
+    parser.add_argument("-output_filepath",
+                        help="(optional) out path/name for output gctx file", default=None)
+    parser.add_argument("-verbose", "-v",
+                        help="Whether to print a bunch of output.", action="store_true", default=False)
+    return parser
+
 
 def main():
-	args = build_parser().parse_args(sys.argv[1:])
-	setup_logger.setup(verbose=args.verbose)
-	in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
-	logger.debug("Original out name: {}".format(in_gctoo.src))
+    args = build_parser().parse_args(sys.argv[1:])
+    setup_logger.setup(verbose=args.verbose)
+    in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
+    logger.debug("Original out name: {}".format(in_gctoo.src))
 
-	if args.output_filepath == None:
-		out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
-	else:
-		out_name = args.output_filepath
+    if args.output_filepath == None:
+        out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
+    else:
+        out_name = args.output_filepath
 
-	write_gctx.write(in_gctoo, out_name)
+    write_gctx.write(in_gctoo, out_name)
 
 
 if __name__ == "__main__":
-	main()
+    main()
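One behavior worth noting in main(): when -output_filepath is omitted, the output name is the basename of the source path truncated at its first ".". A small worked example of that expression (the path is hypothetical):

    # Worked example of the default-name logic in main() above.
    src = "/some/path/my_data.gct"  # hypothetical in_gctoo.src value
    out_name = str.split(src, "/")[-1].split(".")[0]
    # out_name == "my_data"; note that a dotted basename such as
    # "my_data.v1.gct" would also truncate to "my_data"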
