support for ALL_SAMPLES with huge input size

ATPs · ATPs · commit 0727e9a39125 · 2025-04-20T16:41:18.000+08:00
diff --git a/src/PrecisionProDB_Sqlite.py b/src/PrecisionProDB_Sqlite.py
@@ -13,6 +13,7 @@
 from PrecisionProDB_core import PerGeno, get_k_new
 import re
 import sqlite3
+from multiprocessing import Pool
 
 # code below for testing the the program
 TEST = False
@@ -86,14 +87,29 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
     pergeno.chromosomes_genome = chromosomes_genome
     chromosomes_mutation = pergeno.splitMutationByChromosomeLarge(chromosomes_genome_description=chromosomes_genome_description, chromosomes_genome=chromosomes_genome)
 
+    # run tsv2memmap here in parallel worker processes (multiprocessing.Pool sized by `threads`) to save time
+    files_mutation_to_convert = [f'{tempfolder}/{i}.mutation.tsv' for i in chromosomes_mutation]
+    files_mutation_to_convert = [i for i in files_mutation_to_convert if os.path.getsize(i) > 100000000]
+    if len(files_mutation_to_convert) > 0 and threads > 1 and individual !='':
+        if individual == 'ALL_SAMPLES':
+            columns_in_file_mutation = openFile(files_mutation_to_convert[0], 'r').readline().strip().split('\t')
+            individual_for_memmap = [i for i in columns_in_file_mutation if i not in ['chr', 'pos', '', 'ref', 'alt', 'pos_end']]
+        else:
+            individual_for_memmap = individual
+        from vcf2mutation import tsv2memmap
+        pool = Pool(threads)
+        pool.starmap(tsv2memmap, [(i, individual_for_memmap, i +'.memmap') for i in files_mutation_to_convert], chunksize=1)
+        pool.close()
+        pool.join()
+    
     # run runSinglePerChromSqlite
     chromosomes_mutated = [runSinglePerChromSqlite(file_sqlite, f'{tempfolder}/{chromosome}.mutation.tsv', tempfolder, threads, chromosome, datatype, individual) for chromosome in chromosomes_mutation]
     # successful chromosomes
     chromosomes_mutated = [e for e in chromosomes_mutated if e is not None]
     # collect mutation annotations
     files_mutAnno = ['{}/{}.aa_mutations.csv'.format(tempfolder, chromosome) for chromosome in chromosomes_mutated]
     file_mutAnno = outprefix + '.pergeno.aa_mutations.csv'
-    df_mutAnno = pd.concat([pd.read_csv(f, sep='\t') for f in files_mutAnno if os.path.exists(f)], ignore_index=True)
+    df_mutAnno = pd.concat([pd.read_csv(f, sep='\t', low_memory=False) for f in files_mutAnno if os.path.exists(f)], ignore_index=True)
     print('total number of proteins with AA mutation:', df_mutAnno.shape[0])
     df_mutAnno.to_csv(file_mutAnno, sep='\t', index=None)
 
diff --git a/src/PrecisionProDB_core.py b/src/PrecisionProDB_core.py
@@ -396,6 +396,12 @@ def splitMutationByChromosomeLarge(self, chromosomes_genome_description=None, ch
         file_mutations is generated from vcf file, no need to read with pandas or further processing
         '''
         tempfolder = self.tempfolder
+        file_splitMutationByChromosomeLarge_done = os.path.join(tempfolder,'splitMutationByChromosomeLarge.done')
+        if os.path.exists(file_splitMutationByChromosomeLarge_done):
+            print('splitting the mutation file is already finished. use previous results')
+            chromosomes_mutation = open(file_splitMutationByChromosomeLarge_done).read().strip().split('\n')
+            return chromosomes_mutation
+        
         file_mutations = self.file_mutations
         if chromosomes_genome is None:
             chromosomes_genome = self.chromosomes_genome
@@ -421,6 +427,7 @@ def splitMutationByChromosomeLarge(self, chromosomes_genome_description=None, ch
         chromosomes_mutation = list(dc_output.keys())
         
         print('finish splitting the mutation file')
+        open(file_splitMutationByChromosomeLarge_done,'w').write('\n'.join(chromosomes_mutation))
         return chromosomes_mutation
 
     def splitGtfByChromosomes(self,dc_protein2chr):
diff --git a/src/perChrom.py b/src/perChrom.py
@@ -125,7 +125,7 @@ def parse_mutation(file_mutations, chromosome=None, columns_to_include=None, col
         if chromosome:
             df_mutations['chr'] = chromosome
     else:
-        if os.path.getsize(file_mutations) < 1000000000:
+        if os.path.getsize(file_mutations) < 100000000:
             df_mutations = pd.read_csv(file_mutations, sep='\t', low_memory=False)
         else:
             print(file_mutations, 'very large file, use readExtraLargeMutationFile')
diff --git a/src/perChromSqlite.py b/src/perChromSqlite.py
@@ -249,7 +249,7 @@ def __init__(
         # if file_mutation is larger than 1G, only read ['chr', 'pos', 'ref', 'alt']
         if isinstance(self.file_mutations, str):
             if os.path.exists(self.file_mutations):
-                if os.path.getsize(self.file_mutations) > 1000000000:
+                if os.path.getsize(self.file_mutations) > 100000000:
                     self.extra_large_file_mutation = True
         
         if self.extra_large_file_mutation:
@@ -366,8 +366,9 @@ def run_perChrom(self, save_results = True):
             pool = Pool(cpu_counts)
             starmap_args = [[r, df_mutations] for _,r in df_transcript3.iterrows()]
             
-            chunk_size = min(200, total_tasks // cpu_counts // 4)
             total_tasks = df_transcript3.shape[0]
+            chunk_size = min(200, total_tasks // cpu_counts // 4)
+            
             # results = pool.starmap(perChrom.translateCDSplusWithMut2, starmap_args, chunksize=100)
             imap_results = pool.imap(translate_wrapper, starmap_args, chunksize=chunk_size)
             if tqdm: