Skip to content

Commit 0727e9a

Browse files
committed
support for ALL_SAMPLES with huge input size
1 parent 45de197 commit 0727e9a

File tree

4 files changed

+28
-4
lines changed

4 files changed

+28
-4
lines changed

src/PrecisionProDB_Sqlite.py

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313
from PrecisionProDB_core import PerGeno, get_k_new
1414
import re
1515
import sqlite3
16+
from multiprocessing import Pool
1617

1718
# code below for testing the program
1819
TEST = False
@@ -86,14 +87,29 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
8687
pergeno.chromosomes_genome = chromosomes_genome
8788
chromosomes_mutation = pergeno.splitMutationByChromosomeLarge(chromosomes_genome_description=chromosomes_genome_description, chromosomes_genome=chromosomes_genome)
8889

90+
# run tsv2memmap here with multiple threads to save time
91+
files_mutation_to_convert = [f'{tempfolder}/{i}.mutation.tsv' for i in chromosomes_mutation]
92+
files_mutation_to_convert = [i for i in files_mutation_to_convert if os.path.getsize(i) > 100000000]
93+
if len(files_mutation_to_convert) > 0 and threads > 1 and individual !='':
94+
if individual == 'ALL_SAMPLES':
95+
columns_in_file_mutation = openFile(files_mutation_to_convert[0], 'r').readline().strip().split('\t')
96+
individual_for_memmap = [i for i in columns_in_file_mutation if i not in ['chr', 'pos', '', 'ref', 'alt', 'pos_end']]
97+
else:
98+
individual_for_memmap = individual
99+
from vcf2mutation import tsv2memmap
100+
pool = Pool(threads)
101+
pool.starmap(tsv2memmap, [(i, individual_for_memmap, i +'.memmap') for i in files_mutation_to_convert], chunksize=1)
102+
pool.close()
103+
pool.join()
104+
89105
# run runSinglePerChromSqlite
90106
chromosomes_mutated = [runSinglePerChromSqlite(file_sqlite, f'{tempfolder}/{chromosome}.mutation.tsv', tempfolder, threads, chromosome, datatype, individual) for chromosome in chromosomes_mutation]
91107
# successful chromosomes
92108
chromosomes_mutated = [e for e in chromosomes_mutated if e is not None]
93109
# collect mutation annotations
94110
files_mutAnno = ['{}/{}.aa_mutations.csv'.format(tempfolder, chromosome) for chromosome in chromosomes_mutated]
95111
file_mutAnno = outprefix + '.pergeno.aa_mutations.csv'
96-
df_mutAnno = pd.concat([pd.read_csv(f, sep='\t') for f in files_mutAnno if os.path.exists(f)], ignore_index=True)
112+
df_mutAnno = pd.concat([pd.read_csv(f, sep='\t', low_memory=False) for f in files_mutAnno if os.path.exists(f)], ignore_index=True)
97113
print('total number of proteins with AA mutation:', df_mutAnno.shape[0])
98114
df_mutAnno.to_csv(file_mutAnno, sep='\t', index=None)
99115

src/PrecisionProDB_core.py

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -396,6 +396,12 @@ def splitMutationByChromosomeLarge(self, chromosomes_genome_description=None, ch
396396
file_mutations is generated from vcf file, no need to read with pandas or further processing
397397
'''
398398
tempfolder = self.tempfolder
399+
file_splitMutationByChromosomeLarge_done = os.path.join(tempfolder,'splitMutationByChromosomeLarge.done')
400+
if os.path.exists(file_splitMutationByChromosomeLarge_done):
401+
print('splitting the mutation file is already finished. use previous results')
402+
chromosomes_mutation = open(file_splitMutationByChromosomeLarge_done).read().strip().split('\n')
403+
return chromosomes_mutation
404+
399405
file_mutations = self.file_mutations
400406
if chromosomes_genome is None:
401407
chromosomes_genome = self.chromosomes_genome
@@ -421,6 +427,7 @@ def splitMutationByChromosomeLarge(self, chromosomes_genome_description=None, ch
421427
chromosomes_mutation = list(dc_output.keys())
422428

423429
print('finish splitting the mutation file')
430+
open(file_splitMutationByChromosomeLarge_done,'w').write('\n'.join(chromosomes_mutation))
424431
return chromosomes_mutation
425432

426433
def splitGtfByChromosomes(self,dc_protein2chr):

src/perChrom.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,7 @@ def parse_mutation(file_mutations, chromosome=None, columns_to_include=None, col
125125
if chromosome:
126126
df_mutations['chr'] = chromosome
127127
else:
128-
if os.path.getsize(file_mutations) < 1000000000:
128+
if os.path.getsize(file_mutations) < 100000000:
129129
df_mutations = pd.read_csv(file_mutations, sep='\t', low_memory=False)
130130
else:
131131
print(file_mutations, 'very large file, use readExtraLargeMutationFile')

src/perChromSqlite.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -249,7 +249,7 @@ def __init__(
249249
# if file_mutation is larger than 100 MB (100000000 bytes), only read ['chr', 'pos', 'ref', 'alt']
250250
if isinstance(self.file_mutations, str):
251251
if os.path.exists(self.file_mutations):
252-
if os.path.getsize(self.file_mutations) > 1000000000:
252+
if os.path.getsize(self.file_mutations) > 100000000:
253253
self.extra_large_file_mutation = True
254254

255255
if self.extra_large_file_mutation:
@@ -366,8 +366,9 @@ def run_perChrom(self, save_results = True):
366366
pool = Pool(cpu_counts)
367367
starmap_args = [[r, df_mutations] for _,r in df_transcript3.iterrows()]
368368

369-
chunk_size = min(200, total_tasks // cpu_counts // 4)
370369
total_tasks = df_transcript3.shape[0]
370+
chunk_size = min(200, total_tasks // cpu_counts // 4)
371+
371372
# results = pool.starmap(perChrom.translateCDSplusWithMut2, starmap_args, chunksize=100)
372373
imap_results = pool.imap(translate_wrapper, starmap_args, chunksize=chunk_size)
373374
if tqdm:

0 commit comments

Comments
 (0)