4
4
import time
5
5
import json
6
6
from collections import OrderedDict
7
- import pandas as pd
8
- import numpy as np
9
7
10
8
11
9
def parse_arguments ():
@@ -26,6 +24,7 @@ def parse_arguments():
26
24
help = 'Plain text list for drug feature filtering. one item per line' )
27
25
parser .add_argument ('--output' , type = str , default = 'topN.uno.h5' ,
28
26
help = 'output filename' )
27
+ parser .add_argument ('--show' , action = 'store_true' , help = 'Simply show the plan node' )
29
28
30
29
args , unparsed = parser .parse_known_args ()
31
30
return args , unparsed
@@ -36,11 +35,14 @@ def read_plan(filename, node):
36
35
with open (filename , 'r' ) as plan_file :
37
36
plan = json .load (plan_file )
38
37
if node is None :
39
- return plan
40
- if node in plan :
41
- return plan [node ]
38
+ result = plan
39
+ elif node in plan :
40
+ result = plan [node ]
42
41
else :
43
42
raise Exception ('Node index "{}" was not found in plan file' .format (node ))
43
+ print ("read_plan(): done." )
44
+ return result
45
+
44
46
45
47
class topN_NoDataException(Exception):
    """Raised when the plan yields no training ids for the requested node.

    build_masks_w_holdout() raises this when plangen.get_subplan_features()
    returns None for the 'train' ids.
    """
@@ -60,16 +62,16 @@ def build_masks(args, df):
60
62
61
63
for partition in ['train' , 'val' ]:
62
64
_mask = df ['Sample' ] == None # noqa Should keep == operator here. This is a pandas operation.
63
- for i in range (len (ids [partition ]['cell ' ])):
64
- if 'cell ' in ids [partition ] and 'drug ' in ids [partition ]:
65
- cl_filter = ids [partition ]['cell ' ][i ]
66
- dr_filter = ids [partition ]['drug ' ][i ]
67
- __mask = df ['Sample' ].isin (cl_filter ) & df ['Drug1 ' ].isin (dr_filter )
68
- elif 'cell ' in ids [partition ]:
69
- cl_filter = ids [partition ]['cell ' ][i ]
65
+ for i in range (len (ids [partition ]['CELL ' ])):
66
+ if 'CELL ' in ids [partition ] and 'DRUG ' in ids [partition ]:
67
+ cl_filter = ids [partition ]['CELL ' ][i ]
68
+ dr_filter = ids [partition ]['DRUG ' ][i ]
69
+ __mask = df ['Sample' ].isin (cl_filter ) & df ['DRUG1 ' ].isin (dr_filter )
70
+ elif 'CELL ' in ids [partition ]:
71
+ cl_filter = ids [partition ]['CELL ' ][i ]
70
72
__mask = df ['Sample' ].isin (cl_filter )
71
- elif 'drug ' in ids [partition ]:
72
- dr_filter = ids [partition ]['drug ' ][i ]
73
+ elif 'DRUG ' in ids [partition ]:
74
+ dr_filter = ids [partition ]['DRUG ' ][i ]
73
75
__mask = df ['Drug1' ].isin (dr_filter )
74
76
_mask = _mask | __mask
75
77
mask [partition ] = _mask
@@ -84,17 +86,18 @@ def build_masks_w_holdout(args, df):
84
86
85
87
print ('from new build_mask: {} {} {}' .format (args .plan , args .node , args .incremental ))
86
88
import plangen
87
- plan = read_plan (args .plan , None )
89
+ plan = read_plan (args .plan , args .node )
90
+
88
91
ids = {}
89
92
mask = {}
90
- # Dicts {'cell ': [[CCL_510, CCL_577, ...]]} :
93
+ # Dicts {'CELL ': [[CCL_510, CCL_577, ...]]} :
91
94
_ , _ , ids ['train' ], ids ['val' ] = plangen .get_subplan_features (plan , args .node , args .incremental )
92
95
if ids ['train' ] == None :
93
96
print ("topN: get_subplan_features() returned None!" )
94
97
raise topN_NoDataException ()
95
98
96
- print ("cell lines in plan for %s: ids train len: " % args .node +
97
- str (len (ids ['train' ]['cell ' ][0 ])))
99
+ print ("CELL lines in plan for %s: ids train len: " % args .node +
100
+ str (len (ids ['train' ]['CELL ' ][0 ])))
98
101
99
102
# holdout
100
103
from sklearn .model_selection import ShuffleSplit
@@ -124,19 +127,19 @@ def build_masks_w_holdout(args, df):
124
127
125
128
for partition in ['train' , 'val' ]:
126
129
_mask = df ['Sample' ] == None # noqa Should keep == operator here. This is a pandas operation.
127
- for i in range (len (ids [partition ]['cell ' ])):
130
+ for i in range (len (ids [partition ]['CELL ' ])):
128
131
print ("i: %i" % i )
129
132
130
- if 'cell ' in ids [partition ] and 'drug' in ids [partition ]:
133
+ if 'CELL ' in ids [partition ] and 'drug' in ids [partition ]:
131
134
print ("IF CD" )
132
- cl_filter = ids [partition ]['cell ' ][i ]
135
+ cl_filter = ids [partition ]['CELL ' ][i ]
133
136
dr_filter = ids [partition ]['drug' ][i ]
134
137
__mask = df_new ['Sample' ].isin (cl_filter ) & \
135
138
df_new ['Drug1' ].isin (dr_filter )
136
139
137
- elif 'cell ' in ids [partition ]:
140
+ elif 'CELL ' in ids [partition ]:
138
141
print ("IF C." )
139
- cl_filter = ids [partition ]['cell ' ][i ]
142
+ cl_filter = ids [partition ]['CELL ' ][i ]
140
143
__mask = df_new ['Sample' ].isin (cl_filter )
141
144
elif 'drug' in ids [partition ]:
142
145
print ("IF D." )
@@ -148,11 +151,14 @@ def build_masks_w_holdout(args, df):
148
151
149
152
150
153
def get_random_mask(df, fraction=0.8):
    """Return a random boolean mask with one entry per row of ``df``.

    Each entry is drawn independently and is True with probability
    ``fraction``; the result is therefore different on every call unless
    the caller seeds numpy's global random state.

    df: any object supporting len() (e.g. a DataFrame).
    fraction: expected share of True entries, in [0, 1]. Defaults to 0.8,
        the value that was previously hard-coded.

    Returns a numpy bool array of length len(df).
    Raises ValueError if ``fraction`` is outside [0, 1].
    """
    # Imported lazily so that code paths that never call this function do
    # not need numpy loaded.
    import numpy as np

    if not 0.0 <= fraction <= 1.0:
        raise ValueError(
            "fraction must be within [0, 1], got {}".format(fraction))
    # rand() samples uniformly from [0, 1), so 0.0 gives all False and
    # 1.0 gives all True.
    return np.random.rand(len(df)) < fraction
152
156
153
157
154
158
def read_dataframe (args ):
155
159
print ("in read_dataframe" ) ; sys .stdout .flush ()
160
+ import pandas as pd
161
+
156
162
_ , ext = os .path .splitext (args .dataframe_from )
157
163
if ext == '.h5' or ext == '.hdf5' :
158
164
print ("HDFStore r " + str (args .dataframe_from ))
@@ -198,6 +204,7 @@ def read_dataframe(args):
198
204
199
205
def build_dataframe (args ):
200
206
print ("read_dataframe" ) ; sys .stdout .flush ()
207
+ import pandas as pd
201
208
df_y , df_cl , df_dd = read_dataframe (args )
202
209
print ("read_dataframe OK" ) ; sys .stdout .flush ()
203
210
@@ -224,7 +231,7 @@ def build_dataframe(args):
224
231
x_test_0 = df_cl .iloc [~ df_cl .index .isin (tr_vl_idx ), :].reset_index (drop = True )
225
232
x_test_1 = df_dd .iloc [~ df_dd .index .isin (tr_vl_idx ), :].reset_index (drop = True )
226
233
x_test_1 .columns = ['' ] * len (x_val_1 .columns )
227
- else :
234
+ else : # args.fold is None
228
235
# train_mask, val_mask = build_masks(args, df_y)
229
236
train_mask , val_mask , test_mask = build_masks_w_holdout (args , df_y )
230
237
print (str (train_mask ))
@@ -278,6 +285,56 @@ def build_dataframe(args):
278
285
store .close ()
279
286
280
287
288
def print_line(line):
    """Print the strings in ``line`` as one indented, space-separated row.

    line: list of str. An empty list produces no output at all.
    """
    if len(line) > 0:
        # Emit the indent first (no newline), then the joined words.
        print(" ", end="")
        print(" ".join(line))
296
+
297
+
298
def show_list(L):
    """
    Show list entries in indented lines of at most 70 characters of text,
    ending on blank line.

    L: iterable of entries; each entry is converted with str().

    Entries are separated by single spaces and handed to print_line() one
    row at a time.  An entry that is by itself longer than the limit is
    printed alone on its own line rather than being split.
    """
    limit = 70

    line = []
    # Width of " ".join(line) for the entries currently buffered.
    width = 0
    for entry in L:
        s = str(entry)
        # Width the row would have after appending s; the separating
        # space is only needed when the row already holds an entry.
        needed = width + 1 + len(s) if line else len(s)
        if line and needed > limit:
            # s does not fit: flush the current row and start a new one.
            print_line(line)
            line.clear()
            needed = len(s)
        line.append(s)
        width = needed

    # Flush whatever is left (print_line ignores an empty row).
    print_line(line)
    print("")
321
+
322
+
323
def show(args):
    """Print the plan entry for args.node and its val/train cell lists.

    Reads the node from the plan file args.plan via read_plan(), prints
    the whole entry, then for 'val' followed by 'train' prints the number
    of items under [partition][0]['cell'] and lists them with show_list().
    """
    node_entry = read_plan(args.plan, args.node)
    print(node_entry)
    headers = (
        ('val', "val cells: count: %i"),
        ('train', "train cells: count: %i"),
    )
    for partition, header in headers:
        cells = node_entry[partition][0]['cell']
        print(header % len(cells))
        show_list(cells)
333
+
334
+
281
335
if __name__ == '__main__':
    # Script entry point: with --show only display the plan node,
    # otherwise build the dataframe.
    parsed, unparsed = parse_arguments()
    action = show if parsed.show else build_dataframe
    action(parsed)
0 commit comments