Skip to content

Commit 222bbdc

Browse files
committed
moved splitting subroutine into its own function for easier debugging
1 parent 1dfcccf commit 222bbdc

File tree

1 file changed

+135
-118
lines changed

1 file changed

+135
-118
lines changed

scripts/prepare_data_for_improve.py

Lines changed: 135 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -157,6 +157,141 @@ def process_datasets(args):
157157
# creation of splits
158158
#-------------------------------------------------------------------
159159

160+
161+
# TODO: potentially change vars to be read from `args`
162+
split_data_sets(
163+
args=args,
164+
data_sets=data_sets,
165+
data_sets_info=data_sets_info,
166+
response_data=response_data
167+
)
168+
169+
#-------------------------------------------------------------------
170+
# getting common / reference gene symbols
171+
#-------------------------------------------------------------------
172+
173+
# TODO: potentially add mapping to the genes table in coderdata
174+
# currently we do not make use of the 'genes' DataFrame in a Dataset
175+
# object. The gene symbol information comes directly from HGNC.
176+
# There are instances where the entrez_id that is recorded in the
177+
# expression / transcriptome is not in HGNC. Those currently result
178+
# in NaNs for the gene symbol
179+
180+
data_gene_names = pd.read_table(
181+
filepath_or_buffer=args.GENE_TABLE,
182+
)
183+
data_gene_names.rename(
184+
columns={
185+
'NCBI Gene ID': 'entrez_id',
186+
'Ensembl gene ID': 'ensemble_gene_id',
187+
'Approved symbol': 'gene_symbol'
188+
},
189+
inplace=True,
190+
)
191+
data_gene_names.dropna(axis=0, subset='entrez_id', inplace=True)
192+
data_gene_names['entrez_id'] = data_gene_names['entrez_id'].astype(int)
193+
194+
#-------------------------------------------------------------------
195+
# create gene expression master table
196+
#-------------------------------------------------------------------
197+
198+
merged_transcriptomics = merge_master_tables(
199+
args=args,
200+
data_sets=data_sets,
201+
data_type='transcriptomics'
202+
)
203+
204+
# TODO: Potentially cast 'NaN's to 0
205+
206+
# merging ensemble gene id & gene symbol into the transcriptomics
207+
# data
208+
merged_transcriptomics = pd.merge(
209+
merged_transcriptomics,
210+
data_gene_names[[
211+
'entrez_id',
212+
'ensemble_gene_id',
213+
'gene_symbol'
214+
]],
215+
how='left',
216+
on='entrez_id',
217+
)
218+
219+
# moving ensemble_id & gene_symbol columns to the front of the table
220+
# such that when transposing the DataFrame they are row 3 and 2
221+
# respectively
222+
merged_transcriptomics.insert(
223+
1,
224+
'ensemble_gene_id',
225+
merged_transcriptomics.pop('ensemble_gene_id')
226+
)
227+
merged_transcriptomics.insert(
228+
1,
229+
'gene_symbol',
230+
merged_transcriptomics.pop('gene_symbol')
231+
)
232+
233+
# writing the expression datatable to '/x_data/*_expression.tsv'
234+
outfile_path = args.WORKDIR.joinpath(
235+
"data_out",
236+
"x_data",
237+
"cancer_gene_expression.tsv"
238+
)
239+
merged_transcriptomics.transpose().to_csv(
240+
path_or_buf=outfile_path,
241+
sep='\t',
242+
header=False
243+
)
244+
245+
246+
#-------------------------------------------------------------------
247+
# create copynumber master table
248+
#-------------------------------------------------------------------
249+
250+
merged_copy_number = merge_master_tables(args, data_sets=data_sets, data_type='copy_number')
251+
252+
merged_copy_number = pd.merge(
253+
merged_copy_number,
254+
data_gene_names[[
255+
'entrez_id',
256+
'ensemble_gene_id',
257+
'gene_symbol'
258+
]],
259+
how='left',
260+
on='entrez_id',
261+
)
262+
263+
merged_copy_number.insert(
264+
1,
265+
'ensemble_gene_id',
266+
merged_copy_number.pop('ensemble_gene_id')
267+
)
268+
merged_copy_number.insert(
269+
1,
270+
'gene_symbol',
271+
merged_copy_number.pop('gene_symbol')
272+
)
273+
274+
# writing the copy number datatable to '/x_data/*_copy_number.tsv'
275+
outfile_path = args.WORKDIR.joinpath(
276+
"data_out",
277+
"x_data",
278+
"cancer_copy_number.tsv"
279+
)
280+
merged_copy_number.transpose().to_csv(
281+
path_or_buf=outfile_path,
282+
sep='\t',
283+
header=False
284+
)
285+
# join the "meta data tables" like copynumber etc.
286+
287+
288+
def split_data_sets(
289+
args: dict,
290+
data_sets: dict,
291+
data_sets_info: dict,
292+
response_data: pd.DataFrame
293+
):
294+
160295
splits_folder = args.WORKDIR.joinpath('data_out', 'splits')
161296
split_type = args.SPLIT_TYPE
162297
# TODO: potentially change vars to be read from `args`
@@ -292,124 +427,6 @@ def process_datasets(args):
292427
header=False
293428
)
294429

295-
#-------------------------------------------------------------------
296-
# getting common / reference gene symbols
297-
#-------------------------------------------------------------------
298-
299-
# TODO: potentially add mapping to the genes table in coderdata
300-
# currently we do not make use of the 'genes' DataFrame in a Dataset
301-
# object. The gene symbol information comes directly from HGNC.
302-
# There are instances where the entrez_id that is recoreded in the
303-
# expression / transcriptome is not in HGNC. Those currenly result
304-
# in NaNs for the gene symbol
305-
306-
data_gene_names = pd.read_table(
307-
filepath_or_buffer=args.GENE_TABLE,
308-
)
309-
data_gene_names.rename(
310-
columns={
311-
'NCBI Gene ID': 'entrez_id',
312-
'Ensembl gene ID': 'ensemble_gene_id',
313-
'Approved symbol': 'gene_symbol'
314-
},
315-
inplace=True,
316-
)
317-
data_gene_names.dropna(axis=0, subset='entrez_id', inplace=True)
318-
data_gene_names['entrez_id'] = data_gene_names['entrez_id'].astype(int)
319-
320-
#-------------------------------------------------------------------
321-
# create gene expression master table
322-
#-------------------------------------------------------------------
323-
324-
merged_transcriptomics = merge_master_tables(
325-
args=args,
326-
data_sets=data_sets,
327-
data_type='transcriptomics'
328-
)
329-
330-
# TODO: Potentially cast 'NaN's to 0
331-
332-
# merging ensemble gene id & gene symbol into the transcriptomics
333-
# data
334-
merged_transcriptomics = pd.merge(
335-
merged_transcriptomics,
336-
data_gene_names[[
337-
'entrez_id',
338-
'ensemble_gene_id',
339-
'gene_symbol'
340-
]],
341-
how='left',
342-
on='entrez_id',
343-
)
344-
345-
# moving ensemble_id & gene_symbol columns to the front of the table
346-
# such that when transposing the DataFrame they are row 3 and 2
347-
# respectively
348-
merged_transcriptomics.insert(
349-
1,
350-
'ensemble_gene_id',
351-
merged_transcriptomics.pop('ensemble_gene_id')
352-
)
353-
merged_transcriptomics.insert(
354-
1,
355-
'gene_symbol',
356-
merged_transcriptomics.pop('gene_symbol')
357-
)
358-
359-
# writing the expression datatable to '/x_data/*_expression.tsv'
360-
outfile_path = args.WORKDIR.joinpath(
361-
"data_out",
362-
"x_data",
363-
"cancer_gene_expression.tsv"
364-
)
365-
merged_transcriptomics.transpose().to_csv(
366-
path_or_buf=outfile_path,
367-
sep='\t',
368-
header=False
369-
)
370-
371-
372-
#-------------------------------------------------------------------
373-
# create copynumber master table
374-
#-------------------------------------------------------------------
375-
376-
merged_copy_number = merge_master_tables(args, data_sets=data_sets, data_type='copy_number')
377-
378-
merged_copy_number = pd.merge(
379-
merged_copy_number,
380-
data_gene_names[[
381-
'entrez_id',
382-
'ensemble_gene_id',
383-
'gene_symbol'
384-
]],
385-
how='left',
386-
on='entrez_id',
387-
)
388-
389-
merged_copy_number.insert(
390-
1,
391-
'ensemble_gene_id',
392-
merged_copy_number.pop('ensemble_gene_id')
393-
)
394-
merged_copy_number.insert(
395-
1,
396-
'gene_symbol',
397-
merged_copy_number.pop('gene_symbol')
398-
)
399-
400-
# writing the expression datatable to '/x_data/*_copy_number.tsv'
401-
outfile_path = args.WORKDIR.joinpath(
402-
"data_out",
403-
"x_data",
404-
"cancer_copy_number.tsv"
405-
)
406-
merged_copy_number.transpose().to_csv(
407-
path_or_buf=outfile_path,
408-
sep='\t',
409-
header=False
410-
)
411-
# join the "meta data tables" like copynumber etc.
412-
413430

414431
def merge_master_tables(args, data_sets, data_type: str='transcriptomics'):
415432
"""

0 commit comments

Comments
 (0)