11"""
22slice_gct.py
33
4- Extract a subset of data from a gct file. If called from the command line,
5- ids can be provided as a list or as a path to a grp file. If using the
6- slice method in Python, ids or boolean arrays can be used .
4+ Extract a subset of data from a GCT(x) file using the command line. ids can
5+ be provided as a list or as a path to a grp file. See slice_gctoo for the
6+ equivalent method to be used on GCToo objects.
77
88"""
99import sys
1212import sys
1313import os
1414import argparse
15- import pandas as pd
16- import re
1715from cmapPy .pandasGEXpress import setup_GCToo_logger as setup_logger
18- from cmapPy .pandasGEXpress import GCToo
16+ from cmapPy .set_io import grp
17+ from cmapPy .pandasGEXpress import slice_gctoo as sg
1918from cmapPy .pandasGEXpress import parse_gct as pg
2019from cmapPy .pandasGEXpress import write_gct as wg
20+ from cmapPy .pandasGEXpress import write_gctx as wgx
2121
2222__author__ = "Lev Litichevskiy"
2323__email__ = "lev@broadinstitute.org"
@@ -40,14 +40,16 @@ def build_parser():
4040 parser .add_argument ("--exclude_cid" , "-ec" , nargs = "+" , help = "filepath to grp file or string array for excluding cols" )
4141 parser .add_argument ("--out_name" , "-o" , default = "ds_sliced.gct" ,
4242 help = "what to name the output file" )
43+ parser .add_argument ("--use_gctx" , action = "store_true" , default = False ,
44+ help = "whether to write output as GCTx" )
4345 parser .add_argument ("--verbose" , "-v" , action = "store_true" , default = False ,
4446 help = "whether to increase the # of messages reported" )
4547
4648 return parser
4749
4850
4951def main ():
50- # get args
52+ # Get args
5153 args = build_parser ().parse_args (sys .argv [1 :])
5254 setup_logger .setup (verbose = args .verbose )
5355
@@ -61,26 +63,20 @@ def main():
6163 exclude_cid = _read_arg (args .exclude_cid )
6264
6365 # Slice the gct
64- out_gct = slice_gctoo (in_gct , rid = rid , cid = cid , exclude_rid = exclude_rid , exclude_cid = exclude_cid )
66+ out_gct = sg . slice_gctoo (in_gct , rid = rid , cid = cid , exclude_rid = exclude_rid , exclude_cid = exclude_cid )
6567 assert out_gct .data_df .size > 0 , "Slicing yielded an empty gct!"
6668
6769 # Write the output gct
68- wg .write (out_gct , args .out_name , data_null = "NaN" , metadata_null = "NA" , filler_null = "NA" )
69-
70-
def read_grp(in_path):
    """ Read .grp file to a list.

    Args:
        in_path (string): path to the grp file

    Returns:
        entries (list of strings): one whitespace-stripped entry per line of
            the file; lines beginning with '#' and blank lines are skipped
    """
    with open(in_path, 'r') as f:
        entries = []
        for line in f:
            # Comment lines are recognized by a '#' in the very first column
            if line.startswith('#'):
                continue

            # Test the stripped text: a raw line always carries its newline,
            # so testing the raw line would let blank lines through as ''
            entry = line.strip()
            if entry:
                entries.append(entry)

    return entries
70+ if args .use_gctx :
71+ wgx .write (out_gct , args .out_name )
72+ else :
73+ wg .write (out_gct , args .out_name , data_null = "NaN" , metadata_null = "NA" , filler_null = "NA" )
7874
7975
8076def _read_arg (arg ):
8177 """
8278 If arg is a list with 1 element that corresponds to a valid file path, use
83- plategrp to read the grp file. Otherwise, check that arg is a list of strings.
79+ set_io.grp to read the grp file. Otherwise, check that arg is a list of strings.
8480
8581 Args:
8682 arg (list or None)
@@ -96,7 +92,7 @@ def _read_arg(arg):
9692 else :
9793 # If len(arg) == 1 and arg[0] is a valid filepath, read it as a grp file
9894 if len (arg ) == 1 and os .path .exists (arg [0 ]):
99- arg_out = read_grp (arg [0 ])
95+ arg_out = grp . read (arg [0 ])
10096 else :
10197 arg_out = arg
10298
@@ -107,87 +103,5 @@ def _read_arg(arg):
107103 return arg_out
108104
109105
def slice_gctoo(gctoo, row_bool=None, col_bool=None, rid=None, cid=None, exclude_rid=None, exclude_cid=None):
    """ Extract a subset of data from a GCToo object in a variety of ways.

    Rows and columns are always returned in the order in which they appear
    in gctoo.data_df, regardless of the order of the ids provided.

    Args:
        gctoo (GCToo object)
        row_bool (list of bools): length must equal gctoo.data_df.shape[0]
        col_bool (list of bools): length must equal gctoo.data_df.shape[1]
        rid (list of strings): row ids to keep; ids that are not in
            gctoo.data_df.index are ignored
        cid (list of strings): col ids to keep; ids that are not in
            gctoo.data_df.columns are ignored
        exclude_rid (list of strings): row ids to drop from the selection
            made by rid / row_bool (or from all rows if neither is given)
        exclude_cid (list of strings): col ids to drop from the selection
            made by cid / col_bool (or from all cols if neither is given)

    Returns:
        out_gctoo (GCToo object): gctoo after slicing

    Raises:
        AssertionError: if both rid and row_bool (or both cid and col_bool)
            are provided, or if a boolean array has the wrong length
    """
    assert (rid is None) or (row_bool is None), (
        "rid and row_bool should not BOTH be provided.")
    assert (cid is None) or (col_bool is None), (
        "cid and col_bool should not BOTH be provided.")

    # Rows and columns follow exactly the same selection rules
    rows_to_keep_bools = _labels_to_keep(
        gctoo.data_df.index, rid, row_bool, exclude_rid,
        "row_bool", "gctoo.data_df.shape[0]")
    cols_to_keep_bools = _labels_to_keep(
        gctoo.data_df.columns, cid, col_bool, exclude_cid,
        "col_bool", "gctoo.data_df.shape[1]")

    # Make the output gct
    out_gctoo = GCToo.GCToo(
        src=gctoo.src, version=gctoo.version,
        data_df=gctoo.data_df.loc[rows_to_keep_bools, cols_to_keep_bools],
        row_metadata_df=gctoo.row_metadata_df.loc[rows_to_keep_bools, :],
        col_metadata_df=gctoo.col_metadata_df.loc[cols_to_keep_bools, :])

    return out_gctoo


def _labels_to_keep(labels, ids, bools, exclude_ids, bools_name, dim_name):
    """ Build a boolean array marking which labels of one axis to keep.

    Args:
        labels (pandas Index): index or columns of the data matrix
        ids (list of strings or None): labels to keep
        bools (list of bools or None): positional selection over labels
        exclude_ids (list of strings or None): labels to drop afterwards
        bools_name (string): name of the bools argument, for error messages
        dim_name (string): description of the expected length, for error messages

    Returns:
        keep (boolean array): same length as labels
    """
    if ids is not None:
        # Use the ids if provided
        keep = labels.isin(ids)

    elif bools is not None:
        # Use the boolean array if provided
        assert len(bools) == len(labels), (
            "{0} must have length equal to {1}. len({0}): {2}, {1}: {3}".format(
                bools_name, dim_name, len(bools), len(labels)))

        # Selection is by label: every entry whose label was picked is kept
        keep = labels.isin(labels[bools])

    else:
        # If neither ids nor bools are provided, keep everything
        keep = labels.isin(labels)

    if exclude_ids is not None:
        # Keep only those labels that are not in exclude_ids
        keep = keep & ~labels.isin(exclude_ids)

    return keep
190-
191-
192106if __name__ == "__main__" :
193107 main ()
0 commit comments