perChrom, reduce memory and increase speed for very large mutation file

ATPs · ATPs · commit b90b5596ad1c · 2025-01-23T14:47:42.000+08:00
diff --git a/src/perChrom.py b/src/perChrom.py
@@ -38,6 +38,46 @@ def parse_proteins(file_proteins):
     df_protein['protein_anno'] = df_protein.apply(lambda x:x['protein_description'].split(maxsplit=1)[-1] if x['protein_description'].split(maxsplit=1)[-1] != x['protein_id_fasta'] else '', axis=1)
     return df_protein
 
+def readExtraLargeMutationFile(file_mutations):
+    """Read large mutation CSV files (1GB+) with optimized memory usage for binary (0/1) columns
+    
+    Args:
+        file_mutations (str): Path to CSV file containing mutation data
+        
+    Returns:
+        pd.DataFrame: Optimized DataFrame with minimal memory usage
+    """
+    # Step 1: Read sample data to detect binary columns
+    sample = pd.read_csv(file_mutations, nrows=1000, sep='\t')
+    
+    # Step 2: Build dtype dictionary for binary columns
+    dtype_mapping = {}
+    for col in sample.columns:
+        if col in ['chr', 'pos', 'ref', 'alt']:
+            continue
+        unique_values = sample[col].dropna().unique()
+        
+        # Detect binary columns (0/1 only)
+        if set(unique_values).issubset({0, 1}):
+            # Use smallest unsigned int type for binary columns
+            dtype_mapping[col] = 'int8'
+    
+    # Step 3: Read full dataset with optimized dtypes
+    # Keep default dtypes for non-binary columns
+    df = pd.read_csv(
+        file_mutations,
+        dtype=dtype_mapping,
+        engine='c',  # Use C engine for faster parsing
+        true_values=['1'],  # Handle potential string representations
+        false_values=['0'],
+        keep_default_na=False,
+        sep='\t',
+        low_memory = True,
+        # nrows = 1000
+    )
+    
+        
+    return df
 
 
 def parse_mutation(file_mutations, chromosome=None):
@@ -78,7 +118,11 @@ def parse_mutation(file_mutations, chromosome=None):
         if chromosome:
             df_mutations['chr'] = chromosome
     else:
-        df_mutations = pd.read_csv(file_mutations, sep='\t', low_memory=False)
+        if os.path.getsize(file_mutations) < 1000000000:
+            df_mutations = pd.read_csv(file_mutations, sep='\t', low_memory=False)
+        else:
+            print(file_muations, 'very large file, use readExtraLargeMutationFile')
+            df_mutations = readExtraLargeMutationFile(file_mutations)
         df_mutations['pos_end'] = df_mutations['pos'] + df_mutations['ref'].str.len() - 1
         df_mutations = df_mutations.sort_values(by='pos')
         if chromosome: