pandasGEXpress/slice_gctoo.py: split slice_gct and slice_gctoo into separate modules to allow for subsetting with parse_gct (otherwise, we get circular imports)

lev · lev · commit 5ca9900f6d2c · 2018-02-13T11:20:00.000-05:00
diff --git a/cmapPy/pandasGEXpress/slice_gct.py b/cmapPy/pandasGEXpress/slice_gct.py
@@ -1,9 +1,9 @@
 """
 slice_gct.py
 
-Extract a subset of data from a gct file. If called from the command line,
-ids can be provided as a list or as a path to a grp file. If using the
-slice method in Python, ids or boolean arrays can be used.
+Extract a subset of data from a GCT(x) file using the command line. ids can
+be provided as a list or as a path to a grp file. See slice_gctoo for the
+equivalent method to be used on GCToo objects.
 
 """
 import sys
@@ -12,12 +12,12 @@
 import sys
 import os
 import argparse
-import pandas as pd
-import re
 from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
-from cmapPy.pandasGEXpress import GCToo
+from cmapPy.set_io import grp
+from cmapPy.pandasGEXpress import slice_gctoo as sg
 from cmapPy.pandasGEXpress import parse_gct as pg
 from cmapPy.pandasGEXpress import write_gct as wg
+from cmapPy.pandasGEXpress import write_gctx as wgx
 
 __author__ = "Lev Litichevskiy"
 __email__ = "lev@broadinstitute.org"
@@ -40,14 +40,16 @@ def build_parser():
     parser.add_argument("--exclude_cid", "-ec", nargs="+", help="filepath to grp file or string array for excluding cols")
     parser.add_argument("--out_name", "-o", default="ds_sliced.gct",
                         help="what to name the output file")
+    parser.add_argument("--use_gctx", action="store_true", default=False,
+                        help="whether to write output as GCTx")
     parser.add_argument("--verbose", "-v", action="store_true", default=False,
                         help="whether to increase the # of messages reported")
 
     return parser
 
 
 def main():
-    # get args
+    # Get args
     args = build_parser().parse_args(sys.argv[1:])
     setup_logger.setup(verbose=args.verbose)
 
@@ -61,26 +63,20 @@ def main():
     exclude_cid = _read_arg(args.exclude_cid)
 
     # Slice the gct
-    out_gct = slice_gctoo(in_gct, rid=rid, cid=cid, exclude_rid=exclude_rid, exclude_cid=exclude_cid)
+    out_gct = sg.slice_gctoo(in_gct, rid=rid, cid=cid, exclude_rid=exclude_rid, exclude_cid=exclude_cid)
     assert out_gct.data_df.size > 0, "Slicing yielded an empty gct!"
 
     # Write the output gct
-    wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA", filler_null="NA")
-
-
-def read_grp(in_path):
-    """ Read .grp file to a list. """
-
-    with open(in_path, 'r') as f:
-            lines = f.readlines()
-            # second conditional ignores comment lines
-            return [line.strip() for line in lines if line and not re.match('^#', line)]
+    if args.use_gctx:
+        wgx.write(out_gct, args.out_name)
+    else:
+        wg.write(out_gct, args.out_name, data_null="NaN", metadata_null="NA", filler_null="NA")
 
 
 def _read_arg(arg):
     """
     If arg is a list with 1 element that corresponds to a valid file path, use
-    plategrp to read the grp file. Otherwise, check that arg is a list of strings.
+    set_io.grp to read the grp file. Otherwise, check that arg is a list of strings.
 
     Args:
         arg (list or None)
@@ -96,7 +92,7 @@ def _read_arg(arg):
     else:
         # If len(arg) == 1 and arg[0] is a valid filepath, read it as a grp file
         if len(arg) == 1 and os.path.exists(arg[0]):
-            arg_out = read_grp(arg[0])
+            arg_out = grp.read(arg[0])
         else:
             arg_out = arg
 
@@ -107,87 +103,5 @@ def _read_arg(arg):
     return arg_out
 
 
-def slice_gctoo(gctoo, row_bool=None, col_bool=None, rid=None, cid=None, exclude_rid=None, exclude_cid=None):
-    """ Extract a subset of data from a GCToo object in a variety of ways.
-
-    Args:
-        gctoo (GCToo object)
-        row_bool (list of bools): length must equal gctoo.data_df.shape[0]
-        col_bool (list of bools): length must equal gctoo.data_df.shape[1]
-        rid (list of strings): length must equal gctoo.data_df.shape[0]
-        cid (list of strings): length must equal gctoo.data_df.shape[0]
-        exclude_rid (bool): if true, select row ids EXCLUDING 'rid' (default: False)
-        exclude_cid (bool): if true, select col ids EXCLUDING 'cid' (default: False)
-
-    Returns:
-        out_gctoo (GCToo object): gctoo after slicing
-    """
-    assert (rid is None) or (row_bool is None), (
-        "rid and row_bool should not BOTH be provided.")
-    assert (cid is None) or (col_bool is None), (
-        "cid and col_bool should not BOTH be provided.")
-
-    ### ROWS
-    # Use rid if provided
-    if rid is not None:
-        rows_to_keep = [gctoo_row for gctoo_row in gctoo.data_df.index if gctoo_row in rid]
-
-    else:
-        # Use row_bool if provided
-        if row_bool is not None:
-
-            assert len(row_bool) == gctoo.data_df.shape[0], (
-                "row_bool must have length equal to gctoo.data_df.shape[0]. " +
-                "len(row_bool): {}, gctoo.data_df.shape[0]: {}".format(
-                    len(row_bool), gctoo.data_df.shape[0]))
-            rows_to_keep = gctoo.data_df.index[row_bool].values
-
-        else:
-            # If rid and row_bool are both None, return all rows
-            rows_to_keep = gctoo.data_df.index.values
-
-    # Use exclude_rid if provided
-    if exclude_rid is not None:
-        # Keep only those rows that are not in exclude_rid
-        rows_to_keep = [row_to_keep for row_to_keep in rows_to_keep if row_to_keep not in exclude_rid]
-
-    ### COLUMNS
-    # Use cid if provided
-    if cid is not None:
-        cid = pd.Series(cid)
-        cols_to_keep = cid[cid.isin(gctoo.data_df.columns)]
-    else:
-        # Use col_bool if provided
-        if col_bool is not None:
-
-            assert len(col_bool) == gctoo.data_df.shape[1], (
-                "col_bool must have length equal to gctoo.data_df.shape[1]. " +
-                "len(col_bool): {}, gctoo.data_df.shape[1]: {}".format(
-                    len(col_bool), gctoo.data_df.shape[1]))
-            cols_to_keep = gctoo.data_df.columns[col_bool].values
-
-        else:
-            # If cid and col_bool are both None, return all cols
-            cols_to_keep = gctoo.data_df.columns.values
-
-    # Use exclude_cid if provided
-    if exclude_cid is not None:
-        # Keep only those cols that are not in exclude_cid
-        cols_to_keep = [col_to_keep for col_to_keep in cols_to_keep if col_to_keep not in exclude_cid]
-
-    # Convert labels to boolean array
-    rows_to_keep_bools = gctoo.data_df.index.isin(rows_to_keep)
-    cols_to_keep_bools = gctoo.data_df.columns.isin(cols_to_keep)
-
-    # Make the output gct
-    out_gctoo = GCToo.GCToo(
-        src= gctoo.src, version = gctoo.version,
-        data_df=gctoo.data_df.loc[rows_to_keep_bools, cols_to_keep_bools],
-        row_metadata_df=gctoo.row_metadata_df.loc[rows_to_keep_bools, :],
-        col_metadata_df=gctoo.col_metadata_df.loc[cols_to_keep_bools, :])
-
-    return out_gctoo
-
-
 if __name__ == "__main__":
     main()
diff --git a/cmapPy/pandasGEXpress/slice_gctoo.py b/cmapPy/pandasGEXpress/slice_gctoo.py
@@ -0,0 +1,105 @@
+"""
+slice_gctoo.py
+
+Extract a subset of data from a GCToo object using ids or boolean arrays.
+See slice_gct.py for the command line equivalent.
+
+"""
+import sys
+sys.path.insert(0, "../..")
+import logging
+import pandas as pd
+from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
+from cmapPy.pandasGEXpress import GCToo
+
+__author__ = "Lev Litichevskiy"
+__email__ = "lev@broadinstitute.org"
+
+logger = logging.getLogger(setup_logger.LOGGER_NAME)
+
+
+def slice_gctoo(gctoo, row_bool=None, col_bool=None, rid=None, cid=None, exclude_rid=None, exclude_cid=None):
+    """ Extract a subset of data from a GCToo object in a variety of ways.
+
+    Args:
+        gctoo (GCToo object): its data_df, row_metadata_df and col_metadata_df are sliced
+        row_bool (list of bools): length must equal gctoo.data_df.shape[0]
+        col_bool (list of bools): length must equal gctoo.data_df.shape[1]
+        rid (list of strings): row ids to keep; must not be combined with row_bool
+        cid (list of strings): col ids to keep; must not be combined with col_bool
+        exclude_rid (list of strings): row ids to drop from the selection (default: None)
+        exclude_cid (list of strings): col ids to drop from the selection (default: None)
+
+    Returns:
+        out_gctoo (GCToo object): gctoo after slicing; rows and cols keep the order of gctoo
+    """
+    assert (rid is None) or (row_bool is None), (
+        "rid and row_bool should not BOTH be provided.")
+    assert (cid is None) or (col_bool is None), (
+        "cid and col_bool should not BOTH be provided.")
+
+    ### ROWS
+    # Use rid if provided (ids absent from gctoo are silently ignored)
+    if rid is not None:
+        rows_to_keep = [gctoo_row for gctoo_row in gctoo.data_df.index if gctoo_row in rid]
+
+    else:
+        # Use row_bool if provided
+        if row_bool is not None:
+
+            assert len(row_bool) == gctoo.data_df.shape[0], (
+                "row_bool must have length equal to gctoo.data_df.shape[0]. " +
+                "len(row_bool): {}, gctoo.data_df.shape[0]: {}".format(
+                    len(row_bool), gctoo.data_df.shape[0]))
+            rows_to_keep = gctoo.data_df.index[row_bool].values
+
+        else:
+            # If rid and row_bool are both None, return all rows
+            rows_to_keep = gctoo.data_df.index.values
+
+    # Use exclude_rid if provided
+    if exclude_rid is not None:
+        # Keep only those rows that are not in exclude_rid
+        rows_to_keep = [row_to_keep for row_to_keep in rows_to_keep if row_to_keep not in exclude_rid]
+
+    ### COLUMNS
+    # Use cid if provided (ids absent from gctoo are silently ignored)
+    if cid is not None:
+        cid = pd.Series(cid)
+        cols_to_keep = cid[cid.isin(gctoo.data_df.columns)]
+    else:
+        # Use col_bool if provided
+        if col_bool is not None:
+
+            assert len(col_bool) == gctoo.data_df.shape[1], (
+                "col_bool must have length equal to gctoo.data_df.shape[1]. " +
+                "len(col_bool): {}, gctoo.data_df.shape[1]: {}".format(
+                    len(col_bool), gctoo.data_df.shape[1]))
+            cols_to_keep = gctoo.data_df.columns[col_bool].values
+
+        else:
+            # If cid and col_bool are both None, return all cols
+            cols_to_keep = gctoo.data_df.columns.values
+
+    # Use exclude_cid if provided
+    if exclude_cid is not None:
+        # Keep only those cols that are not in exclude_cid
+        cols_to_keep = [col_to_keep for col_to_keep in cols_to_keep if col_to_keep not in exclude_cid]
+
+    # Convert labels to boolean arrays; masking keeps gctoo's original row/col order
+    rows_to_keep_bools = gctoo.data_df.index.isin(rows_to_keep)
+    cols_to_keep_bools = gctoo.data_df.columns.isin(cols_to_keep)
+
+    # Make the output gct; data and both metadata dfs are sliced with the same masks
+    out_gctoo = GCToo.GCToo(
+        src=gctoo.src, version=gctoo.version,
+        data_df=gctoo.data_df.loc[rows_to_keep_bools, cols_to_keep_bools],
+        row_metadata_df=gctoo.row_metadata_df.loc[rows_to_keep_bools, :],
+        col_metadata_df=gctoo.col_metadata_df.loc[cols_to_keep_bools, :])
+
+    logger.info(("Initial GCToo with {} rows and {} columns sliced down to " +
+                 "{} rows and {} columns.").format(
+                      gctoo.data_df.shape[0], gctoo.data_df.shape[1],
+                      out_gctoo.data_df.shape[0], out_gctoo.data_df.shape[1]))
+
+    return out_gctoo
diff --git a/cmapPy/pandasGEXpress/tests/functional_tests/test_slice_rid.grp b/cmapPy/pandasGEXpress/tests/functional_tests/test_slice_rid.grp
@@ -1,4 +1,4 @@
-# used by test_ds_slice
+# used by test_slice_gct
 a
 Bb
 c
diff --git a/cmapPy/pandasGEXpress/tests/test_parse.py b/cmapPy/pandasGEXpress/tests/test_parse.py
@@ -3,14 +3,9 @@
 import logging
 from cmapPy.pandasGEXpress import setup_GCToo_logger as setup_logger
 import unittest
-import pandas as pd 
 import pandas.util.testing as pandas_testing
 from cmapPy.pandasGEXpress import parse 
-from cmapPy.pandasGEXpress import mini_gctoo_for_testing as mini_gctoo_for_testing
-from cmapPy.pandasGEXpress import slice_gct as slice_gct
-
-from cmapPy.pandasGEXpress import GCToo as GCToo 
-from cmapPy.pandasGEXpress import parse_gctx as parse_gctx
+from cmapPy.pandasGEXpress import slice_gctoo as slice_gctoo
 from cmapPy.pandasGEXpress import mini_gctoo_for_testing as mini_gctoo_for_testing
 
 __author__ = "Oana Enache"
@@ -40,15 +35,15 @@ def test_gctx_parsing(self):
         # parsing w/rids & cids specified 
         test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
         test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
-        mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
+        mg3 = slice_gctoo.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
         mg4 = parse("functional_tests/mini_gctoo_for_testing.gctx",
                                rid=test_rids, cid=test_cids)
         pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
         pandas_testing.assert_frame_equal(mg3.row_metadata_df, mg4.row_metadata_df)
         pandas_testing.assert_frame_equal(mg3.col_metadata_df, mg4.col_metadata_df)
 
         # parsing w/ridx & cidx specified 
-        mg5 = slice_gct.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
+        mg5 = slice_gctoo.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
                                     cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
         mg6 = parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])
 
diff --git a/cmapPy/pandasGEXpress/tests/test_parse_gctx.py b/cmapPy/pandasGEXpress/tests/test_parse_gctx.py
@@ -10,7 +10,7 @@
 from cmapPy.pandasGEXpress import GCToo as GCToo 
 from cmapPy.pandasGEXpress import parse_gctx as parse_gctx
 from cmapPy.pandasGEXpress import mini_gctoo_for_testing as mini_gctoo_for_testing
-from cmapPy.pandasGEXpress import slice_gct as slice_gct
+from cmapPy.pandasGEXpress import slice_gctoo as slice_gctoo
 from cmapPy.pandasGEXpress import write_gctx as write_gctx
 import pandas.util.testing as pandas_testing
 from six.moves import range
@@ -55,7 +55,7 @@ def test_parse(self):
         # test with string rid/cid
         test_rids = ['LJP007_MCF10A_24H:TRT_CP:BRD-K93918653:3.33', 'LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666']
         test_cids = ['LJP007_MCF7_24H:TRT_POSCON:BRD-A61304759:10']
-        mg3 = slice_gct.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
+        mg3 = slice_gctoo.slice_gctoo(mg1, rid=test_rids, cid=test_cids)
         mg4 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx",
                                rid=test_rids, cid=test_cids)
         pandas_testing.assert_frame_equal(mg3.data_df, mg4.data_df)
@@ -82,7 +82,7 @@ def test_parse(self):
         # test with numeric (repr as string) rid/cid
         mg5 = GCToo.GCToo(data_df=int_indexed_data_df, row_metadata_df=int_indexed_row_meta,
                           col_metadata_df=int_indexed_col_meta)
-        mg5 = slice_gct.slice_gctoo(mg5, row_bool=[True, False, True, False, True, False],
+        mg5 = slice_gctoo.slice_gctoo(mg5, row_bool=[True, False, True, False, True, False],
                                     col_bool=[True, False, False, True, True, True])
 
         mg5.data_df.index.name = "rid"
@@ -104,7 +104,7 @@ def test_parse(self):
         pandas_testing.assert_frame_equal(mg5.col_metadata_df, mg6.col_metadata_df)
 
         # test with ridx/cidx
-        mg7 = slice_gct.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
+        mg7 = slice_gctoo.slice_gctoo(mg1, rid=['LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666'],
                                     cid='LJP007_MCF7_24H:CTL_VEHICLE:DMSO:-666')
         mg8 = parse_gctx.parse("functional_tests/mini_gctoo_for_testing.gctx", ridx=[4], cidx=[4])
 
diff --git a/cmapPy/pandasGEXpress/tests/test_slice_gct.py b/cmapPy/pandasGEXpress/tests/test_slice_gct.py
diff --git a/cmapPy/pandasGEXpress/tests/test_slice_gctoo.py b/cmapPy/pandasGEXpress/tests/test_slice_gctoo.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-# used by test_ds_slice`
	`1`	`+# used by test_slice_gct`
`2`	`2`	`a`
`3`	`3`	`Bb`
`4`	`4`	`c`