Skip to content

Commit 6d376d7

Browse files
authored
Merge pull request #31 from cmap/consistent_parse_behavior
parse.py to provide exactly the same arguments to parse_gct as to parse_gctx
2 parents 23b83eb + d5b3687 commit 6d376d7

32 files changed

+1322
-1457
lines changed

cmapPy/pandasGEXpress/README.rst

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@ pandasGEXpress library
22
======================
33

44
This is a package of Python scripts that enable reading, writing, and
5-
basic modifications (slicing, concatenation) of .gct and .gctx files.
5+
basic modifications (subsetting, concatenation) of .gct and .gctx files.
66

77
Installation instructions and documentation can be found `on the package's ReadTheDocs page <https://clue.io/cmapPy/index.html>`_.
88

cmapPy/pandasGEXpress/concat.py

Lines changed: 556 additions & 0 deletions
Large diffs are not rendered by default.

cmapPy/pandasGEXpress/concat_gctoo.py

100755100644
Lines changed: 2 additions & 549 deletions
Large diffs are not rendered by default.

cmapPy/pandasGEXpress/gct2gctx.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -10,6 +10,7 @@
1010
import sys
1111
import logging
1212
import argparse
13+
import os.path
1314
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
1415
import cmapPy.pandasGEXpress.parse_gct as parse_gct
1516
import cmapPy.pandasGEXpress.write_gctx as write_gctx
@@ -24,11 +25,12 @@ def build_parser():
2425
parser = argparse.ArgumentParser(description=__doc__,
2526
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
2627
# required
27-
parser.add_argument("-filename",
28-
help=".gct file that you would like converted to .gctx form")
28+
parser.add_argument("-filename", "-f", required=True,
29+
help=".gct file that you would like to convert to .gctx")
2930
# optional
30-
parser.add_argument("-output_filepath",
31-
help="(optional) out path/name for output gctx file", default=None)
31+
parser.add_argument("-output_filepath", "-o", default=None,
32+
help=("out path/name for output gctx file. " +
33+
"Default is just to modify the extension"))
3234
parser.add_argument("-verbose", "-v",
3335
help="Whether to print a bunch of output.", action="store_true", default=False)
3436
return parser
@@ -37,11 +39,17 @@ def build_parser():
3739
def main():
3840
args = build_parser().parse_args(sys.argv[1:])
3941
setup_logger.setup(verbose=args.verbose)
42+
gct2gctx_main(args)
43+
44+
45+
def gct2gctx_main(args):
46+
""" Separate from main() in order to make command-line tool. """
47+
4048
in_gctoo = parse_gct.parse(args.filename, convert_neg_666=False)
41-
logger.debug("Original out name: {}".format(in_gctoo.src))
4249

43-
if args.output_filepath == None:
44-
out_name = str.split(in_gctoo.src, "/")[-1].split(".")[0]
50+
if args.output_filepath is None:
51+
basename = os.path.basename(args.filename)
52+
out_name = os.path.splitext(basename)[0] + ".gctx"
4553
else:
4654
out_name = args.output_filepath
4755

cmapPy/pandasGEXpress/gctx2gct.py

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
Command-line script to convert a .gctx file to .gct.
33
44
Main method takes in a .gctx file path (and, optionally, an
5-
out path and/or name to which to save the equivalent .gctx)
5+
out path and/or name to which to save the equivalent .gct)
66
and saves the enclosed content to a .gct file.
77
88
Note: Only supports v1.0 .gctx files.
@@ -25,12 +25,12 @@ def build_parser():
2525
parser = argparse.ArgumentParser(description=__doc__,
2626
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
2727
# required
28-
parser.add_argument("-filename", "-f",
29-
help=".gctx file that you would like converted to .gct form", required=True)
28+
parser.add_argument("-filename", "-f", required=True,
29+
help=".gctx file that you would like to converted to .gct")
3030
# optional
31-
parser.add_argument("-output_filepath",
32-
help="(optional) out path/name for output gctx file. Default will be the same as input but with extension changed from gctx to gct",
33-
default=None)
31+
parser.add_argument("-output_filepath", "-o", default=None,
32+
help=("out path/name for output gct file. " +
33+
"Default is just to modify the extension"))
3434
parser.add_argument("-verbose", "-v",
3535
help="Whether to print a bunch of output.", action="store_true", default=False)
3636
return parser
@@ -39,10 +39,17 @@ def build_parser():
3939
def main():
4040
args = build_parser().parse_args(sys.argv[1:])
4141
setup_logger.setup(verbose=args.verbose)
42+
gctx2gct_main(args)
43+
44+
45+
def gctx2gct_main(args):
46+
""" Separate from main() in order to make command-line tool. """
47+
4248
in_gctoo = parse_gctx.parse(args.filename, convert_neg_666=False)
43-
if args.output_filepath == None:
49+
50+
if args.output_filepath is None:
4451
basename = os.path.basename(args.filename)
45-
out_name = ".".join(basename.split(".")[:-1])
52+
out_name = os.path.splitext(basename)[0] + ".gct"
4653
else:
4754
out_name = args.output_filepath
4855

cmapPy/pandasGEXpress/parse.py

Lines changed: 23 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -29,19 +29,25 @@ def parse(file_path, convert_neg_666=True, rid=None, cid=None, ridx=None, cidx=N
2929
- gct(x)_file_path (str): full path to gct(x) file you want to parse.
3030
3131
Optional:
32-
- row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
33-
as pandas DataFrame
34-
- col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
35-
as pandas DataFrame
3632
- convert_neg_666 (bool): whether to convert -666 values to numpy.nan or not
3733
(see Note below for more details on this). Default = True.
3834
- rid (list of strings): list of row ids to specifically keep from gctx. Default=None.
3935
- cid (list of strings): list of col ids to specifically keep from gctx. Default=None.
36+
- ridx (list of integers): only read the rows corresponding to this
37+
list of integer ids. Default=None.
38+
- cidx (list of integers): only read the columns corresponding to this
39+
list of integer ids. Default=None.
40+
- row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
41+
as pandas DataFrame
42+
- col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
43+
as pandas DataFrame
4044
- make_multiindex (bool): whether to create a multi-index df combining
4145
the 3 component dfs
4246
4347
Output:
44-
- myGCToo (GCToo)
48+
- out (GCToo object or pandas df): if row_meta_only or col_meta_only, then
49+
out is a metadata df; otherwise, it's a GCToo instance containing
50+
content of parsed gct(x) file
4551
4652
Note: why does convert_neg_666 exist?
4753
- In CMap--for somewhat obscure historical reasons--we use "-666" as our null value
@@ -50,19 +56,21 @@ def parse(file_path, convert_neg_666=True, rid=None, cid=None, ridx=None, cidx=N
5056
into numpy.NaN values, the pandas default.
5157
"""
5258
if file_path.endswith(".gct"):
53-
# Ignoring arguments that won't be passed to parse_gct
54-
for unused_arg in ["rid", "cid", "ridx", "cidx"]:
55-
if eval(unused_arg):
56-
err_msg = "parse_gct does not use the argument {}. Ignoring it...".format(unused_arg)
57-
logger.error(err_msg)
58-
raise Exception(err_msg)
59-
curr = parse_gct.parse(file_path, convert_neg_666, row_meta_only, col_meta_only, make_multiindex)
59+
out = parse_gct.parse(file_path, convert_neg_666=convert_neg_666,
60+
rid=rid, cid=cid, ridx=ridx, cidx=cidx,
61+
row_meta_only=row_meta_only, col_meta_only=col_meta_only,
62+
make_multiindex=make_multiindex)
63+
6064
elif file_path.endswith(".gctx"):
61-
curr = parse_gctx.parse(file_path, convert_neg_666, rid, cid, ridx, cidx, row_meta_only, col_meta_only,
62-
make_multiindex)
65+
out = parse_gctx.parse(file_path, convert_neg_666=convert_neg_666,
66+
rid=rid, cid=cid, ridx=ridx, cidx=cidx,
67+
row_meta_only=row_meta_only, col_meta_only=col_meta_only,
68+
make_multiindex=make_multiindex)
69+
6370
else:
6471
err_msg = "File to parse must be .gct or .gctx!"
6572
logger.error(err_msg)
6673
raise Exception(err_msg)
67-
return curr
74+
75+
return out
6876

cmapPy/pandasGEXpress/parse_gct.py

Lines changed: 37 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,7 @@
6363
import numpy as np
6464
import os.path
6565
import cmapPy.pandasGEXpress.GCToo as GCToo
66+
import cmapPy.pandasGEXpress.subset_gctoo as sg
6667
import cmapPy.pandasGEXpress.setup_GCToo_logger as setup_logger
6768

6869
__author__ = "Lev Litichevskiy, Oana Enache"
@@ -78,22 +79,32 @@
7879
DATA_TYPE = np.float32
7980

8081

81-
def parse(file_path, convert_neg_666=True, row_meta_only=False, col_meta_only=False, make_multiindex=False):
82-
""" The main method.
82+
def parse(file_path, convert_neg_666=True, rid=None, cid=None,
83+
ridx=None, cidx=None, row_meta_only=False, col_meta_only=False, make_multiindex=False):
84+
"""
85+
The main method.
8386
8487
Args:
8588
- file_path (string): full path to gct(x) file you want to parse
8689
- convert_neg_666 (bool): whether to convert -666 values to numpy.nan
87-
(see Note below for more details). Default = True.
88-
- row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
89-
as pandas DataFrame
90-
- col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
91-
as pandas DataFrame
90+
(see Note below for more details). Default = True.
91+
- rid (list of strings): list of row ids to specifically keep from gct. Default=None.
92+
- cid (list of strings): list of col ids to specifically keep from gct. Default=None.
93+
- ridx (list of integers): only read the rows corresponding to this
94+
list of integer ids. Default=None.
95+
- cidx (list of integers): only read the columns corresponding to this
96+
list of integer ids. Default=None.
97+
- row_meta_only (bool): Whether to load data + metadata (if False), or
98+
just row metadata (if True) as pandas DataFrame
99+
- col_meta_only (bool): Whether to load data + metadata (if False), or
100+
just col metadata (if True) as pandas DataFrame
92101
- make_multiindex (bool): whether to create a multi-index df combining
93102
the 3 component dfs
94103
95104
Returns:
96-
gctoo_obj (GCToo object)
105+
- myGCToo (GCToo object): A GCToo instance containing content of
106+
parsed gct file ** OR **
107+
- row_metadata (pandas df) ** OR ** col_metadata (pandas df)
97108
98109
Note: why is convert_neg_666 even a thing?
99110
In CMap--for somewhat obscure historical reasons--we use "-666" as our null value
@@ -102,6 +113,9 @@ def parse(file_path, convert_neg_666=True, row_meta_only=False, col_meta_only=Fa
102113
into numpy.nan values, the pandas default.
103114
104115
"""
116+
assert sum([row_meta_only, col_meta_only]) <= 1, (
117+
"row_meta_only and col_meta_only cannot both be requested.")
118+
105119
nan_values = [
106120
"#N/A", "N/A", "NA", "#NA", "NULL", "NaN", "-NaN",
107121
"nan", "-nan", "#N/A!", "na", "NA", "None", "#VALUE!"]
@@ -126,16 +140,24 @@ def parse(file_path, convert_neg_666=True, row_meta_only=False, col_meta_only=Fa
126140
file_path, num_data_rows, num_data_cols,
127141
num_row_metadata, num_col_metadata, nan_values)
128142

143+
# Create the gctoo object and assemble 3 component dataframes
144+
# Not the most efficient if only metadata requested (i.e. creating the
145+
# whole GCToo just to return the metadata df), but simplest
146+
myGCToo = create_gctoo_obj(file_path, version, row_metadata, col_metadata,
147+
data, make_multiindex)
148+
# Subset if requested
149+
if (rid is not None) or (ridx is not None) or (cid is not None) or (cidx is not None):
150+
logger.info("Subsetting GCT... (note that there are no speed gains when subsetting GCTs)")
151+
myGCToo = sg.subset_gctoo(myGCToo, rid=rid, cid=cid, ridx=ridx, cidx=cidx)
152+
129153
if row_meta_only:
130-
return row_metadata
154+
return myGCToo.row_metadata_df
155+
131156
elif col_meta_only:
132-
return col_metadata
133-
else:
134-
# Create the gctoo object and assemble 3 component dataframes
135-
gctoo_obj = create_gctoo_obj(file_path, version,
136-
row_metadata, col_metadata, data, make_multiindex)
157+
return myGCToo.col_metadata_df
137158

138-
return gctoo_obj
159+
else:
160+
return myGCToo
139161

140162

141163
def read_version_and_dims(file_path):

cmapPy/pandasGEXpress/parse_gctx.py

Lines changed: 21 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -32,8 +32,12 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
3232
Optional:
3333
- convert_neg_666 (bool): whether to convert -666 values to numpy.nan or not
3434
(see Note below for more details on this). Default = True.
35-
- rid (list of strings): only read the row ids in this list from the gctx. Default=None.
36-
- cid (list of strings): only read the column ids in this list from the gctx. Default=None.
35+
- rid (list of strings): list of row ids to specifically keep from gctx. Default=None.
36+
- cid (list of strings): list of col ids to specifically keep from gctx. Default=None.
37+
- ridx (list of integers): only read the rows corresponding to this
38+
list of integer ids. Default=None.
39+
- cidx (list of integers): only read the columns corresponding to this
40+
list of integer ids. Default=None.
3741
- row_meta_only (bool): Whether to load data + metadata (if False), or just row metadata (if True)
3842
as pandas DataFrame
3943
- col_meta_only (bool): Whether to load data + metadata (if False), or just col metadata (if True)
@@ -74,7 +78,7 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
7478

7579
gctx_file.close()
7680

77-
# slice if specified, then return
81+
# subset if specified, then return
7882
row_meta = row_meta.iloc[sorted_ridx]
7983
return row_meta
8084
elif col_meta_only:
@@ -87,7 +91,7 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
8791

8892
gctx_file.close()
8993

90-
# slice if specified, then return
94+
# subset if specified, then return
9195
col_meta = col_meta.iloc[sorted_cidx]
9296
return col_meta
9397
else:
@@ -105,7 +109,7 @@ def parse(gctx_file_path, convert_neg_666=True, rid=None, cid=None,
105109
data_dset = gctx_file[data_node]
106110
data_df = parse_data_df(data_dset, sorted_ridx, sorted_cidx, row_meta, col_meta)
107111

108-
# (if slicing) slice metadata
112+
# (if subsetting) subset metadata
109113
row_meta = row_meta.iloc[sorted_ridx]
110114
col_meta = col_meta.iloc[sorted_cidx]
111115

@@ -146,7 +150,7 @@ def check_and_order_id_inputs(rid, ridx, cid, cidx, row_meta_df, col_meta_df):
146150

147151
def check_id_idx_exclusivity(id, idx):
148152
"""
149-
Makes sure user didn't provide both ids and idx values to slice by.
153+
Makes sure user didn't provide both ids and idx values to subset by.
150154
151155
Input:
152156
- id (list or None): if not None, a list of string id names
@@ -157,7 +161,7 @@ def check_id_idx_exclusivity(id, idx):
157161
"""
158162
if (id is not None and idx is not None):
159163
msg = ("'id' and 'idx' fields can't both not be None," +
160-
" please specify slice in only one of these fields")
164+
" please specify subset in only one of these fields")
161165
logger.error(msg)
162166
raise Exception("parse_gctx.check_id_idx_exclusivity: " + msg)
163167
elif id is not None:
@@ -312,27 +316,27 @@ def set_metadata_index_and_column_names(dim, meta_df):
312316

313317
def parse_data_df(data_dset, ridx, cidx, row_meta, col_meta):
314318
"""
315-
Parses in data_df from hdf5, slicing if specified.
319+
Parses in data_df from hdf5, subsetting if specified.
316320
317321
Input:
318322
-data_dset (h5py dset): HDF5 dataset from which to read data_df
319-
-ridx (list): list of indexes to slice from data_df
320-
(may be all of them if no slicing)
321-
-cidx (list): list of indexes to slice from data_df
322-
(may be all of them if no slicing)
323+
-ridx (list): list of indexes to subset from data_df
324+
(may be all of them if no subsetting)
325+
-cidx (list): list of indexes to subset from data_df
326+
(may be all of them if no subsetting)
323327
-row_meta (pandas DataFrame): the parsed in row metadata
324328
-col_meta (pandas DataFrame): the parsed in col metadata
325329
"""
326-
if len(ridx) == len(row_meta.index) and len(cidx) == len(col_meta.index): # no slice
330+
if len(ridx) == len(row_meta.index) and len(cidx) == len(col_meta.index): # no subset
327331
data_array = np.empty(data_dset.shape, dtype=np.float32)
328332
data_dset.read_direct(data_array)
329333
data_array = data_array.transpose()
330334
elif len(ridx) <= len(cidx):
331-
first_slice = data_dset[:, ridx].astype(np.float32)
332-
data_array = first_slice[cidx, :].transpose()
335+
first_subset = data_dset[:, ridx].astype(np.float32)
336+
data_array = first_subset[cidx, :].transpose()
333337
elif len(cidx) < len(ridx):
334-
first_slice = data_dset[cidx, :].astype(np.float32)
335-
data_array = first_slice[:, ridx].transpose()
338+
first_subset = data_dset[cidx, :].astype(np.float32)
339+
data_array = first_subset[:, ridx].transpose()
336340
# make DataFrame instance
337341
data_df = pd.DataFrame(data_array, index=row_meta.index[ridx], columns=col_meta.index[cidx])
338342
return data_df

0 commit comments

Comments
 (0)