Skip to content

Commit 5ca9900

Browse files
author
lev
committed
pandasGEXpress/slice_gctoo.py: separated slice_gct and slice_gctoo into separate modules to allow for subsetting with parse_gct (otherwise, we get circular imports)
1 parent 5e6429c commit 5ca9900

File tree

7 files changed

+213
-145
lines changed

7 files changed

+213
-145
lines changed

cmapPy/pandasGEXpress/slice_gct.py

Lines changed: 16 additions & 102 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,9 @@
11
"""
22
slice_gct.py
33
4-
Extract a subset of data from a gct file. If called from the command line,
5-
ids can be provided as a list or as a path to a grp file. If using the
6-
slice method in Python, ids or boolean arrays can be used.
4+
Extract a subset of data from a GCT(x) file using the command line. ids can
5+
be provided as a list or as a path to a grp file. See slice_gctoo for the
6+
equivalent method to be used on GCToo objects.
77
88
"""
99
import sys
@@ -12,12 +12,12 @@
1212
import sys
1313
import os
1414
import argparse
15-
import pandas as pd
16-
import re
1715
from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
18-
from cmapPy.pandasGEXpress import GCToo
16+
from cmapPy.set_io import grp
17+
from cmapPy.pandasGEXpress import slice_gctoo as sg
1918
from cmapPy.pandasGEXpress import parse_gct as pg
2019
from cmapPy.pandasGEXpress import write_gct as wg
20+
from cmapPy.pandasGEXpress import write_gctx as wgx
2121

2222
__author__ = "Lev Litichevskiy"
2323
__email__ = "lev@broadinstitute.org"
@@ -40,14 +40,16 @@ def build_parser():
4040
parser.add_argument("--exclude_cid", "-ec", nargs="+", help="filepath to grp file or string array for excluding cols")
4141
parser.add_argument("--out_name", "-o", default="ds_sliced.gct",
4242
help="what to name the output file")
43+
parser.add_argument("--use_gctx", action="store_true", default=False,
44+
help="whether to write output as GCTx")
4345
parser.add_argument("--verbose", "-v", action="store_true", default=False,
4446
help="whether to increase the # of messages reported")
4547

4648
return parser
4749

4850

4951
def main():
50-
# get args
52+
# Get args
5153
args = build_parser().parse_args(sys.argv[1:])
5254
setup_logger.setup(verbose=args.verbose)
5355

@@ -61,26 +63,20 @@ def main():
6163
exclude_cid = _read_arg(args.exclude_cid)
6264

6365
# Slice the gct
64-
out_gct = slice_gctoo(in_gct, rid=rid, cid=cid, exclude_rid=exclude_rid, exclude_cid=exclude_cid)
66+
out_gct = sg.slice_gctoo(in_gct, rid=rid, cid=cid, exclude_rid=exclude_rid, exclude_cid=exclude_cid)
6567
assert out_gct.data_df.size > 0, "Slicing yielded an empty gct!"
6668

6769
# Write the output gct
68-
wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA", filler_null="NA")
69-
70-
71-
def read_grp(in_path):
72-
""" Read .grp file to a list. """
73-
74-
with open(in_path, 'r') as f:
75-
lines = f.readlines()
76-
# second conditional ignores comment lines
77-
return [line.strip() for line in lines if line and not re.match('^#', line)]
70+
if args.use_gctx:
71+
wgx.write(out_gct, args.out_name)
72+
else:
73+
wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA", filler_null="NA")
7874

7975

8076
def _read_arg(arg):
8177
"""
8278
If arg is a list with 1 element that corresponds to a valid file path, use
83-
plategrp to read the grp file. Otherwise, check that arg is a list of strings.
79+
set_io.grp to read the grp file. Otherwise, check that arg is a list of strings.
8480
8581
Args:
8682
arg (list or None)
@@ -96,7 +92,7 @@ def _read_arg(arg):
9692
else:
9793
# If len(arg) == 1 and arg[0] is a valid filepath, read it as a grp file
9894
if len(arg) == 1 and os.path.exists(arg[0]):
99-
arg_out = read_grp(arg[0])
95+
arg_out = grp.read(arg[0])
10096
else:
10197
arg_out = arg
10298

@@ -107,87 +103,5 @@ def _read_arg(arg):
107103
return arg_out
108104

109105

110-
def slice_gctoo(gctoo, row_bool=None, col_bool=None, rid=None, cid=None, exclude_rid=None, exclude_cid=None):
111-
""" Extract a subset of data from a GCToo object in a variety of ways.
112-
113-
Args:
114-
gctoo (GCToo object)
115-
row_bool (list of bools): length must equal gctoo.data_df.shape[0]
116-
col_bool (list of bools): length must equal gctoo.data_df.shape[1]
117-
rid (list of strings): length must equal gctoo.data_df.shape[0]
118-
cid (list of strings): length must equal gctoo.data_df.shape[0]
119-
exclude_rid (bool): if true, select row ids EXCLUDING 'rid' (default: False)
120-
exclude_cid (bool): if true, select col ids EXCLUDING 'cid' (default: False)
121-
122-
Returns:
123-
out_gctoo (GCToo object): gctoo after slicing
124-
"""
125-
assert (rid is None) or (row_bool is None), (
126-
"rid and row_bool should not BOTH be provided.")
127-
assert (cid is None) or (col_bool is None), (
128-
"cid and col_bool should not BOTH be provided.")
129-
130-
### ROWS
131-
# Use rid if provided
132-
if rid is not None:
133-
rows_to_keep = [gctoo_row for gctoo_row in gctoo.data_df.index if gctoo_row in rid]
134-
135-
else:
136-
# Use row_bool if provided
137-
if row_bool is not None:
138-
139-
assert len(row_bool) == gctoo.data_df.shape[0], (
140-
"row_bool must have length equal to gctoo.data_df.shape[0]. " +
141-
"len(row_bool): {}, gctoo.data_df.shape[0]: {}".format(
142-
len(row_bool), gctoo.data_df.shape[0]))
143-
rows_to_keep = gctoo.data_df.index[row_bool].values
144-
145-
else:
146-
# If rid and row_bool are both None, return all rows
147-
rows_to_keep = gctoo.data_df.index.values
148-
149-
# Use exclude_rid if provided
150-
if exclude_rid is not None:
151-
# Keep only those rows that are not in exclude_rid
152-
rows_to_keep = [row_to_keep for row_to_keep in rows_to_keep if row_to_keep not in exclude_rid]
153-
154-
### COLUMNS
155-
# Use cid if provided
156-
if cid is not None:
157-
cid = pd.Series(cid)
158-
cols_to_keep = cid[cid.isin(gctoo.data_df.columns)]
159-
else:
160-
# Use col_bool if provided
161-
if col_bool is not None:
162-
163-
assert len(col_bool) == gctoo.data_df.shape[1], (
164-
"col_bool must have length equal to gctoo.data_df.shape[1]. " +
165-
"len(col_bool): {}, gctoo.data_df.shape[1]: {}".format(
166-
len(col_bool), gctoo.data_df.shape[1]))
167-
cols_to_keep = gctoo.data_df.columns[col_bool].values
168-
169-
else:
170-
# If cid and col_bool are both None, return all cols
171-
cols_to_keep = gctoo.data_df.columns.values
172-
173-
# Use exclude_cid if provided
174-
if exclude_cid is not None:
175-
# Keep only those cols that are not in exclude_cid
176-
cols_to_keep = [col_to_keep for col_to_keep in cols_to_keep if col_to_keep not in exclude_cid]
177-
178-
# Convert labels to boolean array
179-
rows_to_keep_bools = gctoo.data_df.index.isin(rows_to_keep)
180-
cols_to_keep_bools = gctoo.data_df.columns.isin(cols_to_keep)
181-
182-
# Make the output gct
183-
out_gctoo = GCToo.GCToo(
184-
src= gctoo.src, version = gctoo.version,
185-
data_df=gctoo.data_df.loc[rows_to_keep_bools, cols_to_keep_bools],
186-
row_metadata_df=gctoo.row_metadata_df.loc[rows_to_keep_bools, :],
187-
col_metadata_df=gctoo.col_metadata_df.loc[cols_to_keep_bools, :])
188-
189-
return out_gctoo
190-
191-
192106
if __name__ == "__main__":
193107
main()
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
"""
2+
slice_gctoo.py
3+
4+
Extract a subset of data from a GCToo object using ids or boolean arrays.
5+
See slice_gct.py for the command line equivalent.
6+
7+
"""
8+
import sys
9+
sys.path.insert(0, "../..")
10+
import logging
11+
import pandas as pd
12+
from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
13+
from cmapPy.pandasGEXpress import GCToo
14+
15+
__author__ = "Lev Litichevskiy"
16+
__email__ = "lev@broadinstitute.org"
17+
18+
logger = logging.getLogger(setup_logger.LOGGER_NAME)
19+
20+
21+
def slice_gctoo(gctoo, row_bool=None, col_bool=None, rid=None, cid=None, exclude_rid=None, exclude_cid=None):
    """ Extract a subset of data from a GCToo object in a variety of ways.

    Rows and columns are selected independently. For each axis, either a
    collection of ids or a boolean array may be given (not both); if neither
    is given, everything along that axis is kept. Ids listed in exclude_rid /
    exclude_cid are then removed from the selection. The output always keeps
    the row and column order of the input gctoo, regardless of the order in
    which ids were supplied. Ids that are not present in gctoo are ignored.

    Args:
        gctoo (GCToo object)
        row_bool (list of bools): length must equal gctoo.data_df.shape[0]
        col_bool (list of bools): length must equal gctoo.data_df.shape[1]
        rid (list of strings): row ids to keep; a single id is also accepted
        cid (list of strings): col ids to keep; a single id is also accepted
        exclude_rid (list of strings): row ids to drop from the selection
        exclude_cid (list of strings): col ids to drop from the selection

    Returns:
        out_gctoo (GCToo object): gctoo after slicing

    Raises:
        AssertionError: if both ids and a boolean array are provided for the
            same axis, or if a boolean array has the wrong length
    """
    assert (rid is None) or (row_bool is None), (
        "rid and row_bool should not BOTH be provided.")
    assert (cid is None) or (col_bool is None), (
        "cid and col_bool should not BOTH be provided.")

    # One boolean mask per axis; both axes share the same selection rules
    rows_to_keep_bools = _make_keep_mask(
        gctoo.data_df.index, rid, row_bool, exclude_rid, "row_bool", 0)
    cols_to_keep_bools = _make_keep_mask(
        gctoo.data_df.columns, cid, col_bool, exclude_cid, "col_bool", 1)

    # Make the output gct
    out_gctoo = GCToo.GCToo(
        src=gctoo.src, version=gctoo.version,
        data_df=gctoo.data_df.loc[rows_to_keep_bools, cols_to_keep_bools],
        row_metadata_df=gctoo.row_metadata_df.loc[rows_to_keep_bools, :],
        col_metadata_df=gctoo.col_metadata_df.loc[cols_to_keep_bools, :])

    logger.info(("Initial GCToo with {} rows and {} columns sliced down to " +
                 "{} rows and {} columns.").format(
                     gctoo.data_df.shape[0], gctoo.data_df.shape[1],
                     out_gctoo.data_df.shape[0], out_gctoo.data_df.shape[1]))

    return out_gctoo


def _as_id_list(ids):
    """ Return ids as a list, wrapping a lone id (e.g. one string) in a list. """
    # A bare string is not list-like; without wrapping it, membership tests
    # would silently turn into substring matching
    if pd.api.types.is_list_like(ids):
        return list(ids)
    return [ids]


def _make_keep_mask(labels, ids, bools, exclude, bool_name, axis):
    """ Build the boolean mask of entries of labels (a pandas Index) to keep.

    Args:
        labels (pandas Index): gctoo.data_df.index or gctoo.data_df.columns
        ids (list-like, single id, or None): ids to keep
        bools (list of bools or None): positional selection; used if ids is None
        exclude (list-like, single id, or None): ids to remove from the selection
        bool_name (string): "row_bool" or "col_bool"; only used in the error message
        axis (int): 0 for rows, 1 for columns; only used in the error message

    Returns:
        mask (numpy array of bools): same length as labels
    """
    if ids is not None:
        # Vectorized membership test instead of scanning ids once per label
        mask = labels.isin(_as_id_list(ids))

    elif bools is not None:
        assert len(bools) == len(labels), (
            "{name} must have length equal to gctoo.data_df.shape[{axis}]. " +
            "len({name}): {n_bools}, gctoo.data_df.shape[{axis}]: {n_labels}").format(
                name=bool_name, axis=axis, n_bools=len(bools), n_labels=len(labels))

        # Selection goes through the labels (not the raw positions), exactly
        # as before, so repeated labels are kept or dropped together
        mask = labels.isin(labels[bools].values)

    else:
        # Neither ids nor bools provided: keep everything along this axis
        mask = labels.isin(labels.values)

    if exclude is not None:
        # Keep only those labels that are not in exclude
        mask = mask & ~labels.isin(_as_id_list(exclude))

    return mask
Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
# used by test_ds_slice
1+
# used by test_slice_gct
22
a
33
Bb
44
c

cmapPy/pandasGEXpress/tests/test_parse.py

Lines changed: 3 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,14 +3,9 @@
33
import logging
44
from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
55
import unittest
6-
import pandas as pd
76
import pandas.util.testing as pandas_testing
87
from cmapPy.pandasGEXpress import parse
9-
from cmapPy.pandasGEXpress import mini_gctoo_for_testing as mini_gctoo_for_testing
10-
from cmapPy.pandasGEXpress import slice_gct as slice_gct
11-
12-
from cmapPy.pandasGEXpress import GCToo as GCToo
13-
from cmapPy.pandasGEXpress import parse_gctx as parse_gctx
8+
from cmapPy.pandasGEXpress import slice_gctoo as slice_gctoo
149
from cmapPy.pandasGEXpress import mini_gctoo_for_testing as mini_gctoo_for_testing
1510

1611
__author__ = "Oana Enache"
@@ -40,15 +35,15 @@ def test_gctx_parsing(self):
4035
# parsing w/rids & cids specified
4136
test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
4237
test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
43-
mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
38+
mg3 = slice_gctoo.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
4439
mg4 = parse("functional_tests/mini_gctoo_for_testing.gctx",
4540
rid=test_rids, cid=test_cids)
4641
pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
4742
pandas_testing.assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
4843
pandas_testing.assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)
4944

5045
# parsing w/ridx & cidx specified
51-
mg5 = slice_gct.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
46+
mg5 = slice_gctoo.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
5247
cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
5348
mg6 = parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])
5449

cmapPy/pandasGEXpress/tests/test_parse_gctx.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,7 @@
1010
from cmapPy.pandasGEXpress import GCToo as GCToo
1111
from cmapPy.pandasGEXpress import parse_gctx as parse_gctx
1212
from cmapPy.pandasGEXpress import mini_gctoo_for_testing as mini_gctoo_for_testing
13-
from cmapPy.pandasGEXpress import slice_gct as slice_gct
13+
from cmapPy.pandasGEXpress import slice_gctoo as slice_gctoo
1414
from cmapPy.pandasGEXpress import write_gctx as write_gctx
1515
import pandas.util.testing as pandas_testing
1616
from six.moves import range
@@ -55,7 +55,7 @@ def test_parse(self):
5555
# test with string rid/cid
5656
test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
5757
test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
58-
mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
58+
mg3 = slice_gctoo.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
5959
mg4 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
6060
rid=test_rids, cid=test_cids)
6161
pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
@@ -82,7 +82,7 @@ def test_parse(self):
8282
# test with numeric (repr as string) rid/cid
8383
mg5 = GCToo.GCToo(data_df=int_indexed_data_df, row_metadata_df=int_indexed_row_meta,
8484
col_metadata_df=int_indexed_col_meta)
85-
mg5 = slice_gct.slice_gctoo(mg5, row_bool=[True, False, True, False, True, False],
85+
mg5 = slice_gctoo.slice_gctoo(mg5, row_bool=[True, False, True, False, True, False],
8686
col_bool=[True, False, False, True, True, True])
8787

8888
mg5.data_df.index.name = "rid"
@@ -104,7 +104,7 @@ def test_parse(self):
104104
pandas_testing.assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)
105105

106106
# test with ridx/cidx
107-
mg7 = slice_gct.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
107+
mg7 = slice_gctoo.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
108108
cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
109109
mg8 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])
110110

0 commit comments

Comments
 (0)