cmap
diff --git a/‎cmapPy/pandasGEXpress/README.rst‎
Lines changed: 1 addition & 1 deletion b/‎cmapPy/pandasGEXpress/README.rst‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎cmapPy/pandasGEXpress/concat.py‎
Lines changed: 556 additions & 0 deletions b/‎cmapPy/pandasGEXpress/concat.py‎
Lines changed: 556 additions & 0 deletions
diff --git a/‎cmapPy/pandasGEXpress/concat_gctoo.py‎
100755 → 100644
Lines changed: 2 additions & 549 deletions b/‎cmapPy/pandasGEXpress/concat_gctoo.py‎
100755 → 100644
Lines changed: 2 additions & 549 deletions
diff --git a/‎cmapPy/pandasGEXpress/gct2gctx.py‎
Lines changed: 15 additions & 7 deletions b/‎cmapPy/pandasGEXpress/gct2gctx.py‎
Lines changed: 15 additions & 7 deletions
diff --git a/‎cmapPy/pandasGEXpress/gctx2gct.py‎
Lines changed: 15 additions & 8 deletions b/‎cmapPy/pandasGEXpress/gctx2gct.py‎
Lines changed: 15 additions & 8 deletions
diff --git a/‎cmapPy/pandasGEXpress/parse.py‎
Lines changed: 23 additions & 15 deletions b/‎cmapPy/pandasGEXpress/parse.py‎
Lines changed: 23 additions & 15 deletions
diff --git a/‎cmapPy/pandasGEXpress/parse_gct.py‎
Lines changed: 37 additions & 15 deletions b/‎cmapPy/pandasGEXpress/parse_gct.py‎
Lines changed: 37 additions & 15 deletions
diff --git a/‎cmapPy/pandasGEXpress/parse_gctx.py‎
Lines changed: 21 additions & 17 deletions b/‎cmapPy/pandasGEXpress/parse_gctx.py‎
Lines changed: 21 additions & 17 deletions
@@ -2,7 +2,7 @@ pandasGEXpress library
 ======================
 
 This is a package of Python scripts that enable reading, writing, and
-basic modifications (slicing, concatenation) of .gct and .gctx files.
+basic modifications (subsetting, concatenation) of .gct and .gctx files.
 
 Installation instructions and documentation can be found  `on the package's ReadTheDocs page <https://clue.io/cmapPy/index.html>`_. 
 
 
@@ -10,6 +10,7 @@
 import sys
 import logging
 import argparse
+import os.path
 import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 import cmapPy.pandasGEXpress.parse_gct as parse_gct
 import cmapPy.pandasGEXpress.write_gctx as write_gctx
@@ -24,11 +25,12 @@ def build_parser():
     parser = argparse.ArgumentParser(description=__doc__,
                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     # required
-    parser.add_argument("-filename",
-                        help=".gct file that you would like converted to .gctx form")
+    parser.add_argument("-filename", "-f", required=True,
+                        help=".gct file that you would like to convert to .gctx")
     # optional
-    parser.add_argument("-output_filepath",
-                        help="(optional) out path/name for output gctx file", default=None)
+    parser.add_argument("-output_filepath", "-o", default=None,
+                        help=("out path/name for output gctx file. " +
+                              "Default is just to modify the extension"))
     parser.add_argument("-verbose", "-v",
                         help="Whether to print a bunch of output.", action="store_true", default=False)
     return parser
@@ -37,11 +39,17 @@ def build_parser():
 def main():
     args = build_parser().parse_args(sys.argv[1:])
     setup_logger.setup(verbose=args.verbose)
+    gct2gctx_main(args)
+
+
+def gct2gctx_main(args):
+    """ Separate from main() in order to make command-line tool. """
+
     in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
-    logger.debug("Original out name: {}".format(in_gctoo.src))
 
-    if args.output_filepath == None:
-        out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
+    if args.output_filepath is None:
+        basename = os.path.basename(args.filename)
+        out_name = os.path.splitext(basename)[0] + ".gctx"
     else:
         out_name = args.output_filepath
 
 
@@ -2,7 +2,7 @@
 Command-line script to convert a .gctx file to .gct. 
 
 Main method takes in a .gctx file path (and, optionally, an 
-	out path and/or name to which to save the equivalent .gctx)
+	out path and/or name to which to save the equivalent .gct)
 	and saves the enclosed content to a .gct file. 
 
 Note: Only supports v1.0 .gctx files. 
@@ -25,12 +25,12 @@ def build_parser():
     parser = argparse.ArgumentParser(description=__doc__,
                                      formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     # required
-    parser.add_argument("-filename", "-f",
-                        help=".gctx file that you would like converted to .gct form", required=True)
+    parser.add_argument("-filename", "-f", required=True,
+                        help=".gctx file that you would like to convert to .gct")
     # optional
-    parser.add_argument("-output_filepath",
-                        help="(optional) out path/name for output gctx file.  Default will be the same as input but with extension changed from gctx to gct",
-                        default=None)
+    parser.add_argument("-output_filepath", "-o", default=None,
+                        help=("out path/name for output gct file. " +
+                              "Default is just to modify the extension"))
     parser.add_argument("-verbose", "-v",
                         help="Whether to print a bunch of output.", action="store_true", default=False)
     return parser
@@ -39,10 +39,17 @@ def build_parser():
 def main():
     args = build_parser().parse_args(sys.argv[1:])
     setup_logger.setup(verbose=args.verbose)
+    gctx2gct_main(args)
+
+
+def gctx2gct_main(args):
+    """ Separate from main() in order to make command-line tool. """
+
     in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)
-    if args.output_filepath == None:
+
+    if args.output_filepath is None:
         basename = os.path.basename(args.filename)
-        out_name = ".".join(basename.split(".")[:-1])
+        out_name = os.path.splitext(basename)[0] + ".gct"
     else:
         out_name = args.output_filepath
 
 
@@ -29,19 +29,25 @@ def parse(file_path, convert_neg_666=True, rid=None, cid=None, ridx=None, cidx=N
         - gct(x)_file_path (str): full path to gct(x) file you want to parse.
 
         Optional:
-        - row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
-            as pandas DataFrame
-        - col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
-            as pandas DataFrame
         - convert_neg_666 (bool): whether to convert -666 values to numpy.nan or not
             (see Note below for more details on this). Default = False.
         - rid (list of strings): list of row ids to specifically keep from gctx. Default=None.
         - cid (list of strings): list of col ids to specifically keep from gctx. Default=None.
+        - ridx (list of integers): only read the rows corresponding to this
+            list of integer ids. Default=None.
+        - cidx (list of integers): only read the columns corresponding to this
+            list of integer ids. Default=None.
+        - row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
+            as pandas DataFrame
+        - col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
+            as pandas DataFrame
         - make_multiindex (bool): whether to create a multi-index df combining
             the 3 component dfs
 
     Output:
-        - myGCToo (GCToo)
+        - out (GCToo object or pandas df): if row_meta_only or col_meta_only, then
+            out is a metadata df; otherwise, it's a GCToo instance containing
+            content of parsed gct(x) file
 
     Note: why does convert_neg_666 exist?
         - In CMap--for somewhat obscure historical reasons--we use "-666" as our null value
@@ -50,19 +56,21 @@ def parse(file_path, convert_neg_666=True, rid=None, cid=None, ridx=None, cidx=N
         into numpy.NaN values, the pandas default.
     """
     if file_path.endswith(".gct"):
-        # Ignoring arguments that won't be passed to parse_gct
-        for unused_arg in ["rid", "cid", "ridx", "cidx"]:
-            if eval(unused_arg):
-                err_msg = "parse_gct does not use the argument {}. Ignoring it...".format(unused_arg)
-                logger.error(err_msg)
-                raise Exception(err_msg)
-        curr = parse_gct.parse(file_path, convert_neg_666, row_meta_only, col_meta_only, make_multiindex)
+        out = parse_gct.parse(file_path, convert_neg_666=convert_neg_666,
+                              rid=rid, cid=cid, ridx=ridx, cidx=cidx,
+                              row_meta_only=row_meta_only, col_meta_only=col_meta_only,
+                              make_multiindex=make_multiindex)
+
     elif file_path.endswith(".gctx"):
-        curr = parse_gctx.parse(file_path, convert_neg_666, rid, cid, ridx, cidx, row_meta_only, col_meta_only,
-                                make_multiindex)
+        out = parse_gctx.parse(file_path, convert_neg_666=convert_neg_666,
+                              rid=rid, cid=cid, ridx=ridx, cidx=cidx,
+                              row_meta_only=row_meta_only, col_meta_only=col_meta_only,
+                              make_multiindex=make_multiindex)
+
     else:
         err_msg = "File to parse must be .gct or .gctx!"
         logger.error(err_msg)
         raise Exception(err_msg)
-    return curr
+
+    return out
 
@@ -63,6 +63,7 @@
 import numpy as np
 import os.path
 import cmapPy.pandasGEXpress.GCToo as GCToo
+import cmapPy.pandasGEXpress.subset_gctoo as sg
 import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
 
 __author__ = "Lev Litichevskiy, Oana Enache"
@@ -78,22 +79,32 @@
 DATA_TYPE = np.float32
 
 
-def parse(file_path, convert_neg_666=True, row_meta_only=False, col_meta_only=False, make_multiindex=False):
-    """ The main method.
+def parse(file_path, convert_neg_666=True, rid=None, cid=None,
+          ridx=None, cidx=None, row_meta_only=False, col_meta_only=False, make_multiindex=False):
+    """
+    The main method.
 
     Args:
         - file_path (string): full path to gct(x) file you want to parse
         - convert_neg_666 (bool): whether to convert -666 values to numpy.nan
-            (see Note below for more details). Default = True.
-        - row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
-            as pandas DataFrame
-        - col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
-            as pandas DataFrame
+            (see Note below for more details). Default = True.
+        - rid (list of strings): list of row ids to specifically keep from gct. Default=None.
+        - cid (list of strings): list of col ids to specifically keep from gct. Default=None.
+        - ridx (list of integers): only read the rows corresponding to this
+            list of integer ids. Default=None.
+        - cidx (list of integers): only read the columns corresponding to this
+            list of integer ids. Default=None.
+        - row_meta_only (bool): Whether to load data + metadata (if False), or
+            just row metadata (if True) as pandas DataFrame
+        - col_meta_only (bool): Whether to load data + metadata (if False), or
+            just col metadata (if True) as pandas DataFrame
         - make_multiindex (bool): whether to create a multi-index df combining
             the 3 component dfs
 
     Returns:
-        gctoo_obj (GCToo object)
+        - myGCToo (GCToo object): A GCToo instance containing content of
+            parsed gct file ** OR **
+        - row_metadata (pandas df) ** OR ** col_metadata (pandas df)
 
     Note: why is convert_neg_666 even a thing?
         In CMap--for somewhat obscure historical reasons--we use "-666" as our null value
@@ -102,6 +113,9 @@ def parse(file_path, convert_neg_666=True, row_meta_only=False, col_meta_only=Fa
         into numpy.nan values, the pandas default.
 
     """
+    assert sum([row_meta_only, col_meta_only]) <= 1, (
+        "row_meta_only and col_meta_only cannot both be requested.")
+
     nan_values = [
         "#N/A", "N/A", "NA", "#NA", "NULL", "NaN", "-NaN",
         "nan", "-nan", "#N/A!", "na", "NA", "None", "#VALUE!"]
@@ -126,16 +140,24 @@ def parse(file_path, convert_neg_666=True, row_meta_only=False, col_meta_only=Fa
         file_path, num_data_rows, num_data_cols,
         num_row_metadata, num_col_metadata, nan_values)
 
+    # Create the gctoo object and assemble 3 component dataframes
+    # Not the most efficient if only metadata requested (i.e. creating the
+    # whole GCToo just to return the metadata df), but simplest
+    myGCToo = create_gctoo_obj(file_path, version, row_metadata, col_metadata,
+                               data, make_multiindex)
+    # Subset if requested
+    if (rid is not None) or (ridx is not None) or (cid is not None) or (cidx is not None):
+        logger.info("Subsetting GCT... (note that there are no speed gains when subsetting GCTs)")
+        myGCToo = sg.subset_gctoo(myGCToo, rid=rid, cid=cid, ridx=ridx, cidx=cidx)
+
     if row_meta_only:
-        return row_metadata
+        return myGCToo.row_metadata_df
+
     elif col_meta_only:
-        return col_metadata
-    else:
-        # Create the gctoo object and assemble 3 component dataframes
-        gctoo_obj = create_gctoo_obj(file_path, version,
-            row_metadata, col_metadata, data, make_multiindex)
+        return myGCToo.col_metadata_df
 
-        return gctoo_obj
+    else:
+        return myGCToo
 
 
 def read_version_and_dims(file_path):
 
@@ -32,8 +32,12 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
         Optional:
         - convert_neg_666 (bool): whether to convert -666 values to numpy.nan or not
             (see Note below for more details on this). Default = False.
-        - rid (list of strings): only read the row ids in this list from the gctx. Default=None.
-        - cid (list of strings): only read the column ids in this list from the gctx. Default=None.
+        - rid (list of strings): list of row ids to specifically keep from gctx. Default=None.
+        - cid (list of strings): list of col ids to specifically keep from gctx. Default=None.
+        - ridx (list of integers): only read the rows corresponding to this
+            list of integer ids. Default=None.
+        - cidx (list of integers): only read the columns corresponding to this
+            list of integer ids. Default=None.
         - row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
             as pandas DataFrame
         - col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
@@ -74,7 +78,7 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
 
         gctx_file.close()
 
-        # slice if specified, then return
+        # subset if specified, then return
         row_meta = row_meta.iloc[sorted_ridx]
         return row_meta
     elif col_meta_only:
@@ -87,7 +91,7 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
 
         gctx_file.close()
 
-        # slice if specified, then return
+        # subset if specified, then return
         col_meta = col_meta.iloc[sorted_cidx]
         return col_meta
     else:
@@ -105,7 +109,7 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
         data_dset = gctx_file[data_node]
         data_df = parse_data_df(data_dset, sorted_ridx, sorted_cidx, row_meta, col_meta)
 
-        # (if slicing) slice metadata
+        # (if subsetting) subset metadata
         row_meta = row_meta.iloc[sorted_ridx]
         col_meta = col_meta.iloc[sorted_cidx]
 
@@ -146,7 +150,7 @@ def check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta_df, col_meta_df):
 
 def check_id_idx_exclusivity(id, idx):
     """
-    Makes sure user didn't provide both ids and idx values to slice by.
+    Makes sure user didn't provide both ids and idx values to subset by.
 
     Input:
         - id (list or None): if not None, a list of string id names
@@ -157,7 +161,7 @@ def check_id_idx_exclusivity(id, idx):
     """
     if (id is not None and idx is not None):
         msg = ("'id' and 'idx' fields can't both not be None," +
-               " please specify slice in only one of these fields")
+               " please specify subset in only one of these fields")
         logger.error(msg)
         raise Exception("parse_gctx.check_id_idx_exclusivity: " + msg)
     elif id is not None:
@@ -312,27 +316,27 @@ def set_metadata_index_and_column_names(dim, meta_df):
 
 def parse_data_df(data_dset, ridx, cidx, row_meta, col_meta):
     """
-    Parses in data_df from hdf5, slicing if specified.
+    Parses in data_df from hdf5, subsetting if specified.
 
     Input:
         -data_dset (h5py dset): HDF5 dataset from which to read data_df
-        -ridx (list): list of indexes to slice from data_df
-            (may be all of them if no slicing)
-        -cidx (list): list of indexes to slice from data_df
-            (may be all of them if no slicing)
+        -ridx (list): list of indexes to subset from data_df
+            (may be all of them if no subsetting)
+        -cidx (list): list of indexes to subset from data_df
+            (may be all of them if no subsetting)
         -row_meta (pandas DataFrame): the parsed in row metadata
         -col_meta (pandas DataFrame): the parsed in col metadata
     """
-    if len(ridx) == len(row_meta.index) and len(cidx) == len(col_meta.index):  # no slice
+    if len(ridx) == len(row_meta.index) and len(cidx) == len(col_meta.index):  # no subset
         data_array = np.empty(data_dset.shape, dtype=np.float32)
         data_dset.read_direct(data_array)
         data_array = data_array.transpose()
     elif len(ridx) <= len(cidx):
-        first_slice = data_dset[:, ridx].astype(np.float32)
-        data_array = first_slice[cidx, :].transpose()
+        first_subset = data_dset[:, ridx].astype(np.float32)
+        data_array = first_subset[cidx, :].transpose()
     elif len(cidx) < len(ridx):
-        first_slice = data_dset[cidx, :].astype(np.float32)
-        data_array = first_slice[:, ridx].transpose()
+        first_subset = data_dset[cidx, :].astype(np.float32)
+        data_array = first_subset[:, ridx].transpose()
     # make DataFrame instance
     data_df = pd.DataFrame(data_array, index=row_meta.index[ridx], columns=col_meta.index[cidx])
     return data_df