@@ -157,6 +157,141 @@ def process_datasets(args):
157157 # creation of splits
158158 #-------------------------------------------------------------------
159159
160+
161+ # TODO: potentially change vars to be read from `args`
162+ split_data_sets (
163+ args = args ,
164+ data_sets = data_sets ,
165+ data_sets_info = data_sets_info ,
166+ response_data = response_data
167+ )
168+
169+ #-------------------------------------------------------------------
170+ # getting common / reference gene symbols
171+ #-------------------------------------------------------------------
172+
173+ # TODO: potentially add mapping to the genes table in coderdata
174+ # currently we do not make use of the 'genes' DataFrame in a Dataset
175+ # object. The gene symbol information comes directly from HGNC.
176+ # There are instances where the entrez_id that is recorded in the
177+ # expression / transcriptome is not in HGNC. Those currently result
178+ # in NaNs for the gene symbol
179+
180+ data_gene_names = pd .read_table (
181+ filepath_or_buffer = args .GENE_TABLE ,
182+ )
183+ data_gene_names .rename (
184+ columns = {
185+ 'NCBI Gene ID' : 'entrez_id' ,
186+ 'Ensembl gene ID' : 'ensemble_gene_id' ,
187+ 'Approved symbol' : 'gene_symbol'
188+ },
189+ inplace = True ,
190+ )
191+ data_gene_names .dropna (axis = 0 , subset = 'entrez_id' , inplace = True )
192+ data_gene_names ['entrez_id' ] = data_gene_names ['entrez_id' ].astype (int )
193+
194+ #-------------------------------------------------------------------
195+ # create gene expression master table
196+ #-------------------------------------------------------------------
197+
198+ merged_transcriptomics = merge_master_tables (
199+ args = args ,
200+ data_sets = data_sets ,
201+ data_type = 'transcriptomics'
202+ )
203+
204+ # TODO: Potentially cast 'NaN's to 0
205+
206+ # merging ensemble gene id & gene symbol into the transcriptomics
207+ # data
208+ merged_transcriptomics = pd .merge (
209+ merged_transcriptomics ,
210+ data_gene_names [[
211+ 'entrez_id' ,
212+ 'ensemble_gene_id' ,
213+ 'gene_symbol'
214+ ]],
215+ how = 'left' ,
216+ on = 'entrez_id' ,
217+ )
218+
219+ # moving ensemble_id & gene_symbol columns to the front of the table
220+ # such that when transposing the DataFrame they are row 3 and 2
221+ # respectively
222+ merged_transcriptomics .insert (
223+ 1 ,
224+ 'ensemble_gene_id' ,
225+ merged_transcriptomics .pop ('ensemble_gene_id' )
226+ )
227+ merged_transcriptomics .insert (
228+ 1 ,
229+ 'gene_symbol' ,
230+ merged_transcriptomics .pop ('gene_symbol' )
231+ )
232+
233+ # writing the expression datatable to '/x_data/*_expression.tsv'
234+ outfile_path = args .WORKDIR .joinpath (
235+ "data_out" ,
236+ "x_data" ,
237+ "cancer_gene_expression.tsv"
238+ )
239+ merged_transcriptomics .transpose ().to_csv (
240+ path_or_buf = outfile_path ,
241+ sep = '\t ' ,
242+ header = False
243+ )
244+
245+
246+ #-------------------------------------------------------------------
247+ # create copynumber master table
248+ #-------------------------------------------------------------------
249+
250+ merged_copy_number = merge_master_tables (args , data_sets = data_sets , data_type = 'copy_number' )
251+
252+ merged_copy_number = pd .merge (
253+ merged_copy_number ,
254+ data_gene_names [[
255+ 'entrez_id' ,
256+ 'ensemble_gene_id' ,
257+ 'gene_symbol'
258+ ]],
259+ how = 'left' ,
260+ on = 'entrez_id' ,
261+ )
262+
263+ merged_copy_number .insert (
264+ 1 ,
265+ 'ensemble_gene_id' ,
266+ merged_copy_number .pop ('ensemble_gene_id' )
267+ )
268+ merged_copy_number .insert (
269+ 1 ,
270+ 'gene_symbol' ,
271+ merged_copy_number .pop ('gene_symbol' )
272+ )
273+
274+ # writing the copy number datatable to '/x_data/*_copy_number.tsv'
275+ outfile_path = args .WORKDIR .joinpath (
276+ "data_out" ,
277+ "x_data" ,
278+ "cancer_copy_number.tsv"
279+ )
280+ merged_copy_number .transpose ().to_csv (
281+ path_or_buf = outfile_path ,
282+ sep = '\t ' ,
283+ header = False
284+ )
285+ # join the "meta data tables" like copynumber etc.
286+
287+
288+ def split_data_sets (
289+ args : dict ,
290+ data_sets : dict ,
291+ data_sets_info : dict ,
292+ response_data : pd .DataFrame
293+ ):
294+
160295 splits_folder = args .WORKDIR .joinpath ('data_out' , 'splits' )
161296 split_type = args .SPLIT_TYPE
162297 # TODO: potentially change vars to be read from `args`
@@ -292,124 +427,6 @@ def process_datasets(args):
292427 header = False
293428 )
294429
295- #-------------------------------------------------------------------
296- # getting common / reference gene symbols
297- #-------------------------------------------------------------------
298-
299- # TODO: potentially add mapping to the genes table in coderdata
300- # currently we do not make use of the 'genes' DataFrame in a Dataset
301- # object. The gene symbol information comes directly from HGNC.
302- # There are instances where the entrez_id that is recoreded in the
303- # expression / transcriptome is not in HGNC. Those currenly result
304- # in NaNs for the gene symbol
305-
306- data_gene_names = pd .read_table (
307- filepath_or_buffer = args .GENE_TABLE ,
308- )
309- data_gene_names .rename (
310- columns = {
311- 'NCBI Gene ID' : 'entrez_id' ,
312- 'Ensembl gene ID' : 'ensemble_gene_id' ,
313- 'Approved symbol' : 'gene_symbol'
314- },
315- inplace = True ,
316- )
317- data_gene_names .dropna (axis = 0 , subset = 'entrez_id' , inplace = True )
318- data_gene_names ['entrez_id' ] = data_gene_names ['entrez_id' ].astype (int )
319-
320- #-------------------------------------------------------------------
321- # create gene expression master table
322- #-------------------------------------------------------------------
323-
324- merged_transcriptomics = merge_master_tables (
325- args = args ,
326- data_sets = data_sets ,
327- data_type = 'transcriptomics'
328- )
329-
330- # TODO: Potentially cast 'NaN's to 0
331-
332- # merging ensemble gene id & gene symbol into the transcriptomics
333- # data
334- merged_transcriptomics = pd .merge (
335- merged_transcriptomics ,
336- data_gene_names [[
337- 'entrez_id' ,
338- 'ensemble_gene_id' ,
339- 'gene_symbol'
340- ]],
341- how = 'left' ,
342- on = 'entrez_id' ,
343- )
344-
345- # moving ensemble_id & gene_symbol columns to the front of the table
346- # such that when transposing the DataFrame they are row 3 and 2
347- # respectively
348- merged_transcriptomics .insert (
349- 1 ,
350- 'ensemble_gene_id' ,
351- merged_transcriptomics .pop ('ensemble_gene_id' )
352- )
353- merged_transcriptomics .insert (
354- 1 ,
355- 'gene_symbol' ,
356- merged_transcriptomics .pop ('gene_symbol' )
357- )
358-
359- # writing the expression datatable to '/x_data/*_expression.tsv'
360- outfile_path = args .WORKDIR .joinpath (
361- "data_out" ,
362- "x_data" ,
363- "cancer_gene_expression.tsv"
364- )
365- merged_transcriptomics .transpose ().to_csv (
366- path_or_buf = outfile_path ,
367- sep = '\t ' ,
368- header = False
369- )
370-
371-
372- #-------------------------------------------------------------------
373- # create copynumber master table
374- #-------------------------------------------------------------------
375-
376- merged_copy_number = merge_master_tables (args , data_sets = data_sets , data_type = 'copy_number' )
377-
378- merged_copy_number = pd .merge (
379- merged_copy_number ,
380- data_gene_names [[
381- 'entrez_id' ,
382- 'ensemble_gene_id' ,
383- 'gene_symbol'
384- ]],
385- how = 'left' ,
386- on = 'entrez_id' ,
387- )
388-
389- merged_copy_number .insert (
390- 1 ,
391- 'ensemble_gene_id' ,
392- merged_copy_number .pop ('ensemble_gene_id' )
393- )
394- merged_copy_number .insert (
395- 1 ,
396- 'gene_symbol' ,
397- merged_copy_number .pop ('gene_symbol' )
398- )
399-
400- # writing the expression datatable to '/x_data/*_copy_number.tsv'
401- outfile_path = args .WORKDIR .joinpath (
402- "data_out" ,
403- "x_data" ,
404- "cancer_copy_number.tsv"
405- )
406- merged_copy_number .transpose ().to_csv (
407- path_or_buf = outfile_path ,
408- sep = '\t ' ,
409- header = False
410- )
411- # join the "meta data tables" like copynumber etc.
412-
413430
414431def merge_master_tables (args , data_sets , data_type : str = 'transcriptomics' ):
415432 """
0 commit comments