@@ -84,25 +84,56 @@ def get_protein_id_from_df_mutations(df_mutations, file_sqlite, cpu_counts=10):
8484
8585 return combined_results
8686
87- def convert_df_transcript2_to_df_transcript3_helper (protein_id , df_transcript2 , df_mutations , individual , ** kargs ):
87+ def convert_df_transcript2_to_df_transcript3_helper (protein_id , df_transcript2 , df_mutations , individual , kargs ):
8888 '''
8989 protein_id is a protein_id in df_transcript2
9090 for each protein_id, get the mutations in each individual.
9191 return a list of tuple, each tuple is (tuple of variant_index, individuals with this variant_index joined by ',')
92+
93+ If shape and file_memmap are provided in kargs, it means we're dealing with an extra large mutation file
94+ where individual data is stored in a memory-mapped file instead of in the DataFrame.
9295 '''
9396 mutations = df_transcript2 .loc [protein_id ]['mutations' ]
94- tdf_m = df_mutations .loc [mutations ]
95- tdc = {}
96- for sample in individual :
97- if sample not in tdf_m .columns :
98- print ('warning:' , sample , 'not in mutation columns and is ignored! unexpected error may happen!' )
99- continue
100- tdf_m_single = tdf_m [tdf_m [sample ].apply (is_valid )]
101- variant_index = tuple (tdf_m_single .index )
102- if len (variant_index ) > 0 :
103- if variant_index not in tdc :
104- tdc [variant_index ] = []
105- tdc [variant_index ].append (sample )
97+
98+ # Check if we're using memory-mapped file for individual data
99+ if 'shape' in kargs and 'file_memmap' in kargs :
100+ # Using memory-mapped file for individual data
101+ shape = kargs ['shape' ]
102+ file_memmap = kargs ['file_memmap' ]
103+
104+ # Open the memory-mapped file in read mode
105+ mmap = np .memmap (file_memmap , dtype = 'int8' , mode = 'r' , shape = shape )
106+
107+ # Get the indices of mutations for this protein
108+ mutation_indices = mutations
109+
110+ tdc = {}
111+ for i , sample in enumerate (individual ):
112+ # For each sample, check which mutations are valid (value is 1)
113+ valid_mutations = []
114+ for idx in mutation_indices :
115+ if idx < mmap .shape [0 ] and i < mmap .shape [1 ] and mmap [idx , i ] == 1 :
116+ valid_mutations .append (idx )
117+
118+ variant_index = tuple (valid_mutations )
119+ if len (variant_index ) > 0 :
120+ if variant_index not in tdc :
121+ tdc [variant_index ] = []
122+ tdc [variant_index ].append (sample )
123+ else :
124+ # Using regular DataFrame for individual data
125+ tdf_m = df_mutations .loc [mutations ]
126+ tdc = {}
127+ for sample in individual :
128+ if sample not in tdf_m .columns :
129+ print ('warning:' , sample , 'not in mutation columns and is ignored! unexpected error may happen!' )
130+ continue
131+ tdf_m_single = tdf_m [tdf_m [sample ].apply (is_valid )]
132+ variant_index = tuple (tdf_m_single .index )
133+ if len (variant_index ) > 0 :
134+ if variant_index not in tdc :
135+ tdc [variant_index ] = []
136+ tdc [variant_index ].append (sample )
106137
107138 tdc = {k :',' .join (v ) for k ,v in tdc .items ()}
108139 return list (tdc .items ())
@@ -114,6 +145,8 @@ def convert_df_transcript2_to_df_transcript3(df_transcript2, df_mutations, indiv
114145 else, group mutations in df_transcript2 by individual. rename index by adding __X, and add column 'individual'
115146 individual is a list of samples in df_mutations
116147
148+ If shape and file_memmap are provided in kargs, it means we're dealing with an extra large mutation file
149+ where individual data is stored in a memory-mapped file instead of in the DataFrame.
117150 '''
118151 if individual is None or individual == '' or individual == 'None' or individual == [] or individual == 'ALL_VARIANTS' :
119152 return df_transcript2
@@ -122,17 +155,20 @@ def convert_df_transcript2_to_df_transcript3(df_transcript2, df_mutations, indiv
122155 individual = individual
123156 elif individual == 'ALL_SAMPLES' :
124157 individual = [i for i in df_mutations .columns if i not in ['chr' , 'pos' , '' , 'ref' , 'alt' , 'pos_end' ]]
125- print ('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. individuals were set as:' , individual )
158+ if len (individual ) < 100 :
159+ print ('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. individuals were set as:' , individual )
160+ else :
161+ print ('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. {} individuals were used' .format (len (individual )))
126162 elif ',' in individual :
127163 individual = individual .split (',' )
128164 else :
129165 individual = [individual ]
130166
131167 if cpu_counts == 1 :
132- results = [convert_df_transcript2_to_df_transcript3_helper (protein_id , df_transcript2 , df_mutations , individual ) for protein_id in df_transcript2 .index ]
168+ results = [convert_df_transcript2_to_df_transcript3_helper (protein_id , df_transcript2 , df_mutations , individual , kargs ) for protein_id in df_transcript2 .index ]
133169 else :
134170 pool = Pool (cpu_counts )
135- results = pool .starmap (convert_df_transcript2_to_df_transcript3_helper , [(protein_id , df_transcript2 , df_mutations , individual ) for protein_id in df_transcript2 .index ])
171+ results = pool .starmap (convert_df_transcript2_to_df_transcript3_helper , [(protein_id , df_transcript2 , df_mutations , individual , kargs ) for protein_id in df_transcript2 .index ])
136172 pool .close ()
137173 pool .join ()
138174
@@ -227,7 +263,11 @@ def __init__(
227263 individual = individual
228264 elif individual == 'ALL_SAMPLES' :
229265 individual = [i for i in df_mutations .columns if i not in ['chr' , 'pos' , '' , 'ref' , 'alt' , 'pos_end' ]]
230- print ('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. individuals were set as:' , individual )
266+ if len (individual ) < 100 :
267+ print ('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. individuals were set as:' , individual )
268+ else :
269+ print ('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. {} individuals were used' .format (len (individual )))
270+
231271 elif ',' in individual :
232272 individual = individual .split (',' )
233273 elif individual in df_mutations .columns :
@@ -245,6 +285,15 @@ def __init__(
245285 tsv2memmap (file_mutations , individuals = self .individual , memmap_file = file_mutations + '.memmap' )
246286 self .shape = shape
247287 self .file_memmap = file_mutations + '.memmap'
288+
289+ if chromosome :
290+ chromosome_with_genomicLocs = buildSqlite .get_genomicLocs_chromosomes (file_sqlite )
291+ if chromosome in chromosome_with_genomicLocs :
292+ self .df_mutations ['chr' ] = chromosome
293+ elif chromosome .startswith ('chr' ) and chromosome [3 :] in chromosome_with_genomicLocs :
294+ self .df_mutations ['chr' ] = chromosome [3 :]
295+ else :
296+ print (f'warning! chromosome { chromosome } not in file_sqlite' )
248297
249298
250299
@@ -291,7 +340,7 @@ def run_perChrom(self, save_results = True):
291340 df_transcript2 .at [transcript_id , 'mutations' ] = []
292341 df_transcript2 = df_transcript2 [~ df_transcript2 .index .isin (set (tdf_special .index ))]
293342
294- if not self .extra_large_file_mutation
343+ if not self .extra_large_file_mutation :
295344 df_transcript3 = convert_df_transcript2_to_df_transcript3 (df_transcript2 , df_mutations , individual = individual , cpu_counts = cpu_counts )
296345 else :
297346 df_transcript3 = convert_df_transcript2_to_df_transcript3 (df_transcript2 , df_mutations , individual = individual , cpu_counts = cpu_counts , shape = self .shape , file_memmap = self .file_memmap )
@@ -341,6 +390,8 @@ def main():
341390 parser .add_argument ('-s' , '--sample' , help = '''
342391 sample name in the vcf to extract the variant information. default: None, use all variants and do not consider samples.
343392 For multiple samples, use "," to join the sample names. For example, "--sample sample1,sample2,sample3".
393+ To use all samples, use "--sample ALL_SAMPLES".
394+ To use all variants regardless where the variants from, use "--sample ALL_VARIANTS".
344395 ''' , default = None )
345396
346397 if TEST :
0 commit comments