3939 file_sqlite = '/XCLabServer002_fastIO/examples/GENCODE/GENCODE.tsv.sqlite'
4040
4141
42- def runSinglePerChromSqlite (file_sqlite , file_mutations , tempfolder , threads , chromosome , datatype , individual ):
42+ def runSinglePerChromSqlite (file_sqlite , file_mutations , tempfolder , threads , chromosome , datatype , individual , force_memmap = False ):
4343 '''
4444 run PerChrom_sqlite for a single chromosome
4545 '''
@@ -51,7 +51,8 @@ def runSinglePerChromSqlite(file_sqlite, file_mutations, tempfolder, threads, ch
5151 outprefix = outprefix ,
5252 datatype = datatype ,
5353 chromosome = chromosome ,
54- individual = individual )
54+ individual = individual ,
55+ force_memmap = force_memmap )
5556 print ('run perchrom_sqlite for chromosome' , chromosome )
5657 # perchrom_sqlite.run_perChrom()
5758 # print('finished running perchrom_sqlite for chromosome:', chromosome)
@@ -65,7 +66,7 @@ def runSinglePerChromSqlite(file_sqlite, file_mutations, tempfolder, threads, ch
6566 return None
6667
6768
68- def runPerChomSqlite (file_sqlite , file_mutations , threads , outprefix , protein_keyword , datatype , keep_all , individual , chromosomes_genome , chromosomes_genome_description , file_gtf ):
69+ def runPerChomSqlite (file_sqlite , file_mutations , threads , outprefix , protein_keyword , datatype , keep_all , individual , chromosomes_genome , chromosomes_genome_description , file_gtf , force_memmap = False ):
6970 '''
7071 '''
7172 # split file_mutations by chromosome
@@ -83,27 +84,43 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
8384
8485 if individual == 'ALL_VARIANTS' :
8586 individual = ''
87+ # only force memmap when sample information is retained
88+ use_force_memmap = force_memmap and individual not in ['' , None ]
8689 tempfolder = pergeno .tempfolder
8790 pergeno .chromosomes_genome = chromosomes_genome
88- chromosomes_mutation = pergeno .splitMutationByChromosomeLarge (chromosomes_genome_description = chromosomes_genome_description , chromosomes_genome = chromosomes_genome )
91+ individual_columns = None
92+ if isinstance (individual , str ) and individual not in ['' , 'None' , 'ALL_SAMPLES' ]:
93+ individual_columns = [i .strip () for i in individual .split (',' ) if i .strip ()]
94+ elif isinstance (individual , (list , tuple )):
95+ individual_columns = [i for i in individual if i ]
96+ chromosomes_mutation = pergeno .splitMutationByChromosomeLarge (
97+ chromosomes_genome_description = chromosomes_genome_description ,
98+ chromosomes_genome = chromosomes_genome ,
99+ individual_columns = individual_columns ,
100+ enable_memmap = use_force_memmap
101+ )
89102
90103 # run tsv2memmap here with multiple threads to save time
91104 files_mutation_to_convert = [f'{ tempfolder } /{ i } .mutation.tsv' for i in chromosomes_mutation ]
92105 files_mutation_to_convert = [i for i in files_mutation_to_convert if os .path .getsize (i ) > 100000000 ]
93- if len (files_mutation_to_convert ) > 0 and threads > 1 and individual != '' :
94- if individual == 'ALL_SAMPLES' :
95- columns_in_file_mutation = openFile ( files_mutation_to_convert [ 0 ], 'r' ). readline (). strip (). split ( ' \t ' )
96- individual_for_memmap = [ i for i in columns_in_file_mutation if i not in [ 'chr' , 'pos' , '' , 'ref' , 'alt' , 'pos_end' ]]
106+ if len (files_mutation_to_convert ) > 0 and threads > 1 and individual != '' and not use_force_memmap :
107+ files_mutation_to_convert = [ i for i in files_mutation_to_convert if not os . path . exists ( i + '.memmap' )]
108+ if len ( files_mutation_to_convert ) == 0 :
109+ print ( 'memmap files already present for large chromosomes, skip regeneration.' )
97110 else :
98- individual_for_memmap = individual
99- from vcf2mutation import tsv2memmap
100- pool = Pool (threads )
101- pool .starmap (tsv2memmap , [(i , individual_for_memmap , i + '.memmap' ) for i in files_mutation_to_convert ], chunksize = 1 )
102- pool .close ()
103- pool .join ()
111+ if individual == 'ALL_SAMPLES' :
112+ columns_in_file_mutation = openFile (files_mutation_to_convert [0 ], 'r' ).readline ().strip ().split ('\t ' )
113+ individual_for_memmap = [i for i in columns_in_file_mutation if i not in ['chr' , 'pos' , '' , 'ref' , 'alt' , 'pos_end' ]]
114+ else :
115+ individual_for_memmap = individual
116+ from vcf2mutation import tsv2memmap
117+ pool = Pool (threads )
118+ pool .starmap (tsv2memmap , [(i , individual_for_memmap , i + '.memmap' ) for i in files_mutation_to_convert ], chunksize = 1 )
119+ pool .close ()
120+ pool .join ()
104121
105122 # run runSinglePerChromSqlite
106- chromosomes_mutated = [runSinglePerChromSqlite (file_sqlite , f'{ tempfolder } /{ chromosome } .mutation.tsv' , tempfolder , threads , chromosome , datatype , individual ) for chromosome in chromosomes_mutation ]
123+ chromosomes_mutated = [runSinglePerChromSqlite (file_sqlite , f'{ tempfolder } /{ chromosome } .mutation.tsv' , tempfolder , threads , chromosome , datatype , individual , force_memmap = use_force_memmap ) for chromosome in chromosomes_mutation ]
107124 # successful chromosomes
108125 chromosomes_mutated = [e for e in chromosomes_mutated if e is not None ]
109126 # collect mutation annotations
@@ -115,8 +132,22 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
115132
116133 # collect individual information for later use when adding unchanged proteins per individual
117134 all_individuals = []
135+ base_columns = {'chr' , 'pos' , 'ref' , 'alt' , 'pos_end' }
118136 if isinstance (individual , str ):
119- if individual not in ['' , 'None' ]:
137+ if individual == 'ALL_SAMPLES' :
138+ # Derive the actual sample columns from one of the split mutation files so
139+ # unchanged proteins can still be emitted when some individuals remain ref-only.
140+ sample_header_file = None
141+ for chromosome in chromosomes_mutation :
142+ candidate = f'{ tempfolder } /{ chromosome } .mutation.tsv'
143+ if os .path .exists (candidate ):
144+ sample_header_file = candidate
145+ break
146+ if sample_header_file :
147+ with openFile (sample_header_file , 'r' ) as fo :
148+ header_columns = fo .readline ().strip ().split ('\t ' )
149+ all_individuals = [col for col in header_columns if col not in base_columns ]
150+ elif individual not in ['' , 'None' ]:
120151 all_individuals = [i .strip () for i in individual .split (',' ) if i .strip ()]
121152 elif isinstance (individual , (list , tuple )):
122153 all_individuals = [i for i in individual if i ]
@@ -190,6 +221,7 @@ def runPerChomSqlite_vcf(file_mutations, file_sqlite, threads, outprefix, dataty
190221 # get two mutation files from vcf file
191222 print ('start extracting mutation file from the vcf input' )
192223 outprefix_vcf = outprefix + '.vcf2mutation'
224+ force_memmap = (individual == 'ALL_SAMPLES' )
193225 individual = convertVCF2MutationComplex (file_vcf = file_mutations , outprefix = outprefix_vcf , individual_input = individual , filter_PASS = filter_PASS , chromosome_only = chromosome_only , info_field = info_field , info_field_thres = info_field_thres , threads = threads )
194226 individual = ',' .join (individual )
195227 print ('finished extracting mutations from the vcf file' )
@@ -208,7 +240,8 @@ def runPerChomSqlite_vcf(file_mutations, file_sqlite, threads, outprefix, dataty
208240 individual ,
209241 chromosomes_genome ,
210242 chromosomes_genome_description ,
211- file_gtf
243+ file_gtf ,
244+ force_memmap = force_memmap
212245 )
213246
214247
@@ -368,7 +401,8 @@ def main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protei
368401 individual = individual ,
369402 chromosomes_genome = chromosomes_genome ,
370403 chromosomes_genome_description = chromosomes_genome_description ,
371- file_gtf = file_gtf
404+ file_gtf = file_gtf ,
405+ force_memmap = (individual == 'ALL_SAMPLES' )
372406 )
373407
374408
0 commit comments