Add force_memmap support for large mutation files to increase speed

ATPs · ATPs · commit d6d21b152d6b · 2025-12-03T14:41:35.000+08:00
diff --git a/src/PrecisionProDB_Sqlite.py b/src/PrecisionProDB_Sqlite.py
@@ -39,7 +39,7 @@
     file_sqlite = '/XCLabServer002_fastIO/examples/GENCODE/GENCODE.tsv.sqlite'
 
 
-def runSinglePerChromSqlite(file_sqlite, file_mutations, tempfolder, threads, chromosome, datatype, individual):
+def runSinglePerChromSqlite(file_sqlite, file_mutations, tempfolder, threads, chromosome, datatype, individual, force_memmap=False):
     '''
     run PerChrom_sqlite for a single chromosome
     '''
@@ -51,7 +51,8 @@ def runSinglePerChromSqlite(file_sqlite, file_mutations, tempfolder, threads, ch
                     outprefix = outprefix,
                     datatype = datatype,
                     chromosome = chromosome,
-                    individual = individual)
+                    individual = individual,
+                    force_memmap = force_memmap)
     print('run perchrom_sqlite for chromosome', chromosome)
     # perchrom_sqlite.run_perChrom()
     # print('finished running perchrom_sqlite for chromosome:', chromosome)
@@ -65,7 +66,7 @@ def runSinglePerChromSqlite(file_sqlite, file_mutations, tempfolder, threads, ch
         return None
 
 
-def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_keyword, datatype, keep_all, individual, chromosomes_genome, chromosomes_genome_description, file_gtf):
+def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_keyword, datatype, keep_all, individual, chromosomes_genome, chromosomes_genome_description, file_gtf, force_memmap=False):
     '''
     '''
     # split file_mutations by chromosome
@@ -83,27 +84,43 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
     
     if individual == 'ALL_VARIANTS':
         individual = ''
+    # only force memmap when sample information is retained
+    use_force_memmap = force_memmap and individual not in ['', None]
     tempfolder = pergeno.tempfolder
     pergeno.chromosomes_genome = chromosomes_genome
-    chromosomes_mutation = pergeno.splitMutationByChromosomeLarge(chromosomes_genome_description=chromosomes_genome_description, chromosomes_genome=chromosomes_genome)
+    individual_columns = None
+    if isinstance(individual, str) and individual not in ['', 'None', 'ALL_SAMPLES']:
+        individual_columns = [i.strip() for i in individual.split(',') if i.strip()]
+    elif isinstance(individual, (list, tuple)):
+        individual_columns = [i for i in individual if i]
+    chromosomes_mutation = pergeno.splitMutationByChromosomeLarge(
+        chromosomes_genome_description=chromosomes_genome_description,
+        chromosomes_genome=chromosomes_genome,
+        individual_columns=individual_columns,
+        enable_memmap=use_force_memmap
+    )
 
     # run tsv2memmap here with multiple threads to save time
     files_mutation_to_convert = [f'{tempfolder}/{i}.mutation.tsv' for i in chromosomes_mutation]
     files_mutation_to_convert = [i for i in files_mutation_to_convert if os.path.getsize(i) > 100000000]
-    if len(files_mutation_to_convert) > 0 and threads > 1 and individual !='':
-        if individual == 'ALL_SAMPLES':
-            columns_in_file_mutation = openFile(files_mutation_to_convert[0], 'r').readline().strip().split('\t')
-            individual_for_memmap = [i for i in columns_in_file_mutation if i not in ['chr', 'pos', '', 'ref', 'alt', 'pos_end']]
+    if len(files_mutation_to_convert) > 0 and threads > 1 and individual !='' and not use_force_memmap:
+        files_mutation_to_convert = [i for i in files_mutation_to_convert if not os.path.exists(i + '.memmap')]
+        if len(files_mutation_to_convert) == 0:
+            print('memmap files already present for large chromosomes, skip regeneration.')
         else:
-            individual_for_memmap = individual
-        from vcf2mutation import tsv2memmap
-        pool = Pool(threads)
-        pool.starmap(tsv2memmap, [(i, individual_for_memmap, i +'.memmap') for i in files_mutation_to_convert], chunksize=1)
-        pool.close()
-        pool.join()
+            if individual == 'ALL_SAMPLES':
+                columns_in_file_mutation = openFile(files_mutation_to_convert[0], 'r').readline().strip().split('\t')
+                individual_for_memmap = [i for i in columns_in_file_mutation if i not in ['chr', 'pos', '', 'ref', 'alt', 'pos_end']]
+            else:
+                individual_for_memmap = individual
+            from vcf2mutation import tsv2memmap
+            pool = Pool(threads)
+            pool.starmap(tsv2memmap, [(i, individual_for_memmap, i +'.memmap') for i in files_mutation_to_convert], chunksize=1)
+            pool.close()
+            pool.join()
     
     # run runSinglePerChromSqlite
-    chromosomes_mutated = [runSinglePerChromSqlite(file_sqlite, f'{tempfolder}/{chromosome}.mutation.tsv', tempfolder, threads, chromosome, datatype, individual) for chromosome in chromosomes_mutation]
+    chromosomes_mutated = [runSinglePerChromSqlite(file_sqlite, f'{tempfolder}/{chromosome}.mutation.tsv', tempfolder, threads, chromosome, datatype, individual, force_memmap=use_force_memmap) for chromosome in chromosomes_mutation]
     # successful chromosomes
     chromosomes_mutated = [e for e in chromosomes_mutated if e is not None]
     # collect mutation annotations
@@ -115,8 +132,22 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
 
     # collect individual information for later use when adding unchanged proteins per individual
     all_individuals = []
+    base_columns = {'chr', 'pos', 'ref', 'alt', 'pos_end'}
     if isinstance(individual, str):
-        if individual not in ['', 'None']:
+        if individual == 'ALL_SAMPLES':
+            # Derive the actual sample columns from one of the split mutation files so
+            # unchanged proteins can still be emitted when some individuals remain ref-only.
+            sample_header_file = None
+            for chromosome in chromosomes_mutation:
+                candidate = f'{tempfolder}/{chromosome}.mutation.tsv'
+                if os.path.exists(candidate):
+                    sample_header_file = candidate
+                    break
+            if sample_header_file:
+                with openFile(sample_header_file, 'r') as fo:
+                    header_columns = fo.readline().strip().split('\t')
+                all_individuals = [col for col in header_columns if col not in base_columns]
+        elif individual not in ['', 'None']:
             all_individuals = [i.strip() for i in individual.split(',') if i.strip()]
     elif isinstance(individual, (list, tuple)):
         all_individuals = [i for i in individual if i]
@@ -190,6 +221,7 @@ def runPerChomSqlite_vcf(file_mutations, file_sqlite, threads, outprefix, dataty
     # get two mutation files from vcf file
     print('start extracting mutation file from the vcf input')
     outprefix_vcf = outprefix + '.vcf2mutation'
+    force_memmap = (individual == 'ALL_SAMPLES')
     individual = convertVCF2MutationComplex(file_vcf = file_mutations, outprefix = outprefix_vcf, individual_input=individual, filter_PASS = filter_PASS, chromosome_only = chromosome_only, info_field = info_field, info_field_thres=info_field_thres, threads = threads)
     individual = ','.join(individual)
     print('finished extracting mutations from the vcf file')
@@ -208,7 +240,8 @@ def runPerChomSqlite_vcf(file_mutations, file_sqlite, threads, outprefix, dataty
                      individual, 
                      chromosomes_genome, 
                      chromosomes_genome_description, 
-                     file_gtf
+                     file_gtf,
+                     force_memmap=force_memmap
                      )
     
 
@@ -368,7 +401,8 @@ def main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protei
             individual=individual,
             chromosomes_genome=chromosomes_genome,
             chromosomes_genome_description=chromosomes_genome_description,
-            file_gtf=file_gtf
+            file_gtf=file_gtf,
+            force_memmap=(individual == 'ALL_SAMPLES')
             )
 
 
diff --git a/src/PrecisionProDB_core.py b/src/PrecisionProDB_core.py
@@ -7,6 +7,7 @@
 from perChrom import PerChrom
 import shutil
 import re
+from array import array
 
 def get_k_new(k, chromosomes_genome, chromosomes_genome_description):
     '''k is chromosome name. return chromosome name based on chromosomes_genome, chromosomes_genome_description
@@ -391,9 +392,12 @@ def splitMutationByChromosome(self, chromosomes_genome_description=None, chromos
         print('finish splitting the mutation file')
         return chromosomes_mutation
 
-    def splitMutationByChromosomeLarge(self, chromosomes_genome_description=None, chromosomes_genome=None):
+    def splitMutationByChromosomeLarge(self, chromosomes_genome_description=None, chromosomes_genome=None, individual_columns=None, enable_memmap=False):
         '''split mutation file based on chromosomes
         file_mutations is generated from vcf file, no need to read with pandas or further processing
+        individual_columns is an optional list of sample columns that should be written
+        to memmap files while the TSV is being split. enable_memmap toggles this streamed
+        writer to avoid re-reading the large TSV with pandas later on.
         '''
         tempfolder = self.tempfolder
         file_splitMutationByChromosomeLarge_done = os.path.join(tempfolder,'splitMutationByChromosomeLarge.done')
@@ -409,10 +413,25 @@ def splitMutationByChromosomeLarge(self, chromosomes_genome_description=None, ch
             chromosomes_genome_description = self.chromosomes_genome_description
         
         dc_output = {}
+        memmap_writers = {}
+        sample_column_indices = []
         fo = openFile(file_mutations)
         for line in fo:
             break
         header = line
+        header_columns = header.strip().split('\t')
+        if enable_memmap:
+            if individual_columns:
+                name_to_index = {name: idx for idx, name in enumerate(header_columns)}
+                sample_column_indices = [name_to_index[name] for name in individual_columns if name in name_to_index]
+                if len(sample_column_indices) != len(individual_columns):
+                    missing = set(individual_columns) - set(name_to_index)
+                    if missing:
+                        print('warning: columns not found for memmap writing:', ','.join(missing))
+            else:
+                base_columns = {'chr', 'pos', 'ref', 'alt', 'pos_end'}
+                # fallback: treat every column beyond the required fields as sample genotype
+                sample_column_indices = [idx for idx, name in enumerate(header_columns) if name not in base_columns]
         for line in fo:
             k = line.split('\t', maxsplit=1)[0]
             k_new = get_k_new(k, chromosomes_genome, chromosomes_genome_description)
@@ -421,15 +440,65 @@ def splitMutationByChromosomeLarge(self, chromosomes_genome_description=None, ch
                 dc_output[k_new] = open(tf,'w')
                 dc_output[k_new].write(header)
             dc_output[k_new].write(line)
+            if enable_memmap and sample_column_indices:
+                if k_new not in memmap_writers:
+                    memmap_filename = tf + '.memmap'
+                    memmap_writers[k_new] = self._ChromosomeMemmapWriter(memmap_filename, len(sample_column_indices))  # nested helper class, reached through the instance
+                line_values = line.strip().split('\t')
+                sample_values = [line_values[idx] if idx < len(line_values) else '0' for idx in sample_column_indices]  # rows shorter than the header are padded with '0'
+                memmap_writers[k_new].add_row(sample_values)
         
+        fo.close()
         for k_new in dc_output:
             dc_output[k_new].close()
+        for writer in memmap_writers.values():
+            writer.finalize()
         chromosomes_mutation = list(dc_output.keys())
         
         print('finish splitting the mutation file')
         open(file_splitMutationByChromosomeLarge_done,'w').write('\n'.join(chromosomes_mutation))
         return chromosomes_mutation
 
+    class _ChromosomeMemmapWriter:
+        """Stream sample matrices into byte-aligned files for later memmap usage.
+
+        Nested at method indentation so the methods defined after it stay bound to the enclosing class.
+        """
+
+        def __init__(self, filename, n_cols):
+            """
+            Args:
+                filename (str): Destination path for the raw binary matrix.
+                n_cols (int): Number of sample columns stored per row.
+            """
+            self.filename = filename
+            self.n_cols = n_cols
+            self.handle = open(filename, 'wb')
+            self.rows = 0
+
+        def add_row(self, values):
+            """Append one row of int8 flags: 0 for '', '0', '0.0', '.' or 'False', otherwise 1.
+
+            Args:
+                values (Iterable[str]): Raw string values from the TSV columns.
+            """
+            row_array = array('b', (1 if value not in ('', '0', '0.0', '.', 'False') else 0 for value in values))
+            if len(row_array) != self.n_cols:
+                raise ValueError(f'mismatched memmap width: expected {self.n_cols}, got {len(row_array)}')
+            row_array.tofile(self.handle)
+            self.rows += 1
+
+        def finalize(self):
+            """Close file handle and create the companion .done flag."""
+            self.handle.close()
+            open(self.filename + '.done', 'w').close()
+
+        def __del__(self):
+            """Ensure file handle closes if finalize is not called explicitly."""
+            handle = getattr(self, 'handle', None)  # attribute is missing when open() raised in __init__
+            if handle is not None and not handle.closed:
+                handle.close()
+
     def splitGtfByChromosomes(self,dc_protein2chr):
         '''split gtf file based on chromosome. only keep proteins in file_protein
         '''
@@ -740,4 +809,4 @@ def main():
     pergeno.runPerChom()
 
 if __name__ == '__main__':
-    main()
+    main()
diff --git a/src/perChromSqlite.py b/src/perChromSqlite.py
@@ -17,6 +17,8 @@
 except:
     print('Cannot import tqdm. Will not use tqdm.')
     tqdm = False
+
+_MEMMAP_CACHE = {}
 # # Global variables
 # con = None
 # df_mutations = None
@@ -88,42 +90,54 @@ def get_protein_id_from_df_mutations(df_mutations, file_sqlite, cpu_counts=10):
     
     return combined_results
 
+def _load_memmap(file_memmap, shape):
+    """Return the read-only int8 np.memmap for (file_memmap, shape), opening it at most once per process."""
+    key = (file_memmap, shape)
+    try:
+        return _MEMMAP_CACHE[key]
+    except KeyError:
+        return _MEMMAP_CACHE.setdefault(key, np.memmap(file_memmap, dtype='int8', mode='r', shape=shape))
+
+
 def convert_df_transcript2_to_df_transcript3_helper(protein_id, df_transcript2, df_mutations, individual, kargs):
     '''
-    protein_id is a protein_id in df_transcript2
-    for each protein_id, get the mutations in each individual.
-    return a list of tuple, each tuple is (tuple of variant_index, individuals with this variant_index joined by ',')
-    
-    If shape and file_memmap are provided in kargs, it means we're dealing with an extra large mutation file
-    where individual data is stored in a memory-mapped file instead of in the DataFrame.
+    Convert mutation membership for a single protein into grouped variant patterns.
+
+    The function inspects all variants linked to the provided protein and determines,
+    for every individual, which subset of those variants is carried. Individuals sharing
+    the same variant combination are collapsed into a single entry so downstream
+    translation only needs to be executed once per unique pattern. When a memory-mapped
+    allele matrix is available, the lookups are vectorized to avoid per-sample Python
+    loops.
     '''
     mutations = df_transcript2.loc[protein_id]['mutations']
     
     # Check if we're using memory-mapped file for individual data
     if 'shape' in kargs and 'file_memmap' in kargs:
-        # Using memory-mapped file for individual data
         shape = kargs['shape']
         file_memmap = kargs['file_memmap']
-        
-        # Open the memory-mapped file in read mode
-        mmap = np.memmap(file_memmap, dtype='int8', mode='r', shape=shape)
-        
-        # Get the indices of mutations for this protein
-        mutation_indices = mutations
-        
+        mmap = _load_memmap(file_memmap, shape)
+        mutation_indices = np.array(mutations, dtype=int)
+        if mutation_indices.size == 0:
+            return []
+        allele_block = mmap[mutation_indices, :]
+        allele_block = allele_block if allele_block.ndim == 2 else allele_block.reshape(1, -1)
+        # Locate every (variant, sample) pair where the allele is present.
+        presence_coords = np.argwhere(allele_block == 1)
+        if presence_coords.size == 0:
+            return []
+        sample_to_variants = {}
+        for variant_pos, sample_idx in presence_coords:
+            variant_idx = mutation_indices[variant_pos]
+            sample_to_variants.setdefault(sample_idx, []).append(variant_idx)
         tdc = {}
-        for i, sample in enumerate(individual):
-            # For each sample, check which mutations are valid (value is 1)
-            valid_mutations = []
-            for idx in mutation_indices:
-                if idx < mmap.shape[0] and i < mmap.shape[1] and mmap[idx, i] == 1:
-                    valid_mutations.append(idx)
-            
-            variant_index = tuple(valid_mutations)
-            if len(variant_index) > 0:
-                if variant_index not in tdc:
-                    tdc[variant_index] = []
-                tdc[variant_index].append(sample)
+        for sample_idx, variant_list in sample_to_variants.items():
+            if not variant_list:
+                continue
+            variant_index = tuple(sorted(variant_list))
+            if len(variant_index) == 0:
+                continue
+            tdc.setdefault(variant_index, []).append(individual[sample_idx])
     else:
         # Using regular DataFrame for individual data
         tdf_m = df_mutations.loc[mutations]
@@ -236,7 +250,8 @@ def __init__(
                     outprefix,
                     datatype,
                     chromosome,
-                    individual = None
+                    individual = None,
+                    force_memmap = False
                 ):
         self.file_sqlite = file_sqlite # genome file location
         self.file_mutations = file_mutations # mutation file location
@@ -245,9 +260,12 @@ def __init__(
         self.datatype = datatype # input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq or gtf
         self.chromosome = chromosome # chromosome name
         self.extra_large_file_mutation = False # whether the mutation file is larger than 1G
+        self.force_memmap = force_memmap
         
         # if file_mutation is larger than 1G, only read ['chr', 'pos', 'ref', 'alt']
+        file_memmap_path = None
         if isinstance(self.file_mutations, str):
+            file_memmap_path = self.file_mutations + '.memmap'
             if os.path.exists(self.file_mutations):
                 if os.path.getsize(self.file_mutations) > 100000000:
                     self.extra_large_file_mutation = True
@@ -289,11 +307,19 @@ def __init__(
         
         self.individual = individual
         
+        # Determine whether we already built memmap data for this file or if it should be forced.
+        memmap_exists = bool(file_memmap_path and os.path.exists(file_memmap_path))
+        if not self.individual:
+            self.extra_large_file_mutation = False
+        else:
+            self.extra_large_file_mutation = bool(self.individual) and (self.extra_large_file_mutation or memmap_exists or self.force_memmap)
+        
         if self.extra_large_file_mutation:
             self.df_mutations = perChrom.parse_mutation(file_mutations, columns_to_include=['chr', 'pos', 'ref', 'alt'])
             from vcf2mutation import tsv2memmap
             shape = (self.df_mutations.shape[0], len(self.individual))
-            tsv2memmap(file_mutations, individuals = self.individual, memmap_file=file_mutations + '.memmap')
+            if not memmap_exists:
+                tsv2memmap(file_mutations, individuals = self.individual, memmap_file=file_mutations + '.memmap')
             self.shape = shape
             self.file_memmap = file_mutations + '.memmap'
         
@@ -445,4 +471,4 @@ def main():
     perchrom_sqlite.run_perChrom()
 
 if __name__ == '__main__':
-    main()
+    main()