@@ -38,6 +38,46 @@ def parse_proteins(file_proteins):
3838 df_protein ['protein_anno' ] = df_protein .apply (lambda x :x ['protein_description' ].split (maxsplit = 1 )[- 1 ] if x ['protein_description' ].split (maxsplit = 1 )[- 1 ] != x ['protein_id_fasta' ] else '' , axis = 1 )
3939 return df_protein
4040
def readExtraLargeMutationFile(file_mutations):
    """Read a large tab-separated mutation table, storing 0/1 columns as int8.

    The first 1000 rows are sampled to find columns that hold only the
    integers 0 and 1. Those columns are parsed as ``int8`` when the whole
    file is read; every other column keeps the dtype pandas infers.

    Args:
        file_mutations (str): Path to the tab-separated mutation file.

    Returns:
        pd.DataFrame: The full table, with the detected binary columns
            stored as int8.

    Raises:
        ValueError: Propagated from ``pd.read_csv`` when a column that
            looked binary in the sample contains, further down the file,
            a value that cannot be parsed as int8 (detection only sees the
            sampled rows).
    """
    # Columns that are never treated as binary flags, whatever they contain.
    excluded_columns = {'chr', 'pos', 'ref', 'alt'}
    sample_rows = 1000

    # Step 1: sample the head of the file. The sample must use the same NA
    # handling as the full read below; otherwise blank cells become NaN here,
    # get dropped, and a column with blanks is wrongly tagged as int8, which
    # then fails when the full read meets the empty strings.
    sample = pd.read_csv(
        file_mutations,
        nrows=sample_rows,
        sep='\t',
        keep_default_na=False,
    )

    # Step 2: collect columns whose sampled values are exactly integer 0/1.
    dtype_mapping = {}
    for col in sample.columns:
        if col in excluded_columns:
            continue
        values = sample[col]
        # Require a non-empty, integer-typed column: float (0.0/1.0), bool and
        # string columns cannot be parsed straight into int8 by read_csv.
        if values.empty or not pd.api.types.is_integer_dtype(values):
            continue
        if values.isin([0, 1]).all():
            # int8 is the smallest signed integer dtype; 1 byte per cell.
            dtype_mapping[col] = 'int8'

    # Step 3: read the whole file, forcing int8 on the detected columns and
    # leaving pandas' default inference for everything else.
    df = pd.read_csv(
        file_mutations,
        dtype=dtype_mapping,
        engine='c',  # C parser: required for low_memory, fastest option
        true_values=['1'],
        false_values=['0'],
        keep_default_na=False,  # keep blank cells as '' instead of NaN
        sep='\t',
        low_memory=True,  # parse in internal chunks to limit peak memory
    )

    return df
4181
4282
4383def parse_mutation (file_mutations , chromosome = None ):
@@ -78,7 +118,11 @@ def parse_mutation(file_mutations, chromosome=None):
78118 if chromosome :
79119 df_mutations ['chr' ] = chromosome
80120 else :
81- df_mutations = pd .read_csv (file_mutations , sep = '\t ' , low_memory = False )
121+ if os .path .getsize (file_mutations ) < 1000000000 :
122+ df_mutations = pd .read_csv (file_mutations , sep = '\t ' , low_memory = False )
123+ else :
124+ print (file_muations , 'very large file, use readExtraLargeMutationFile' )
125+ df_mutations = readExtraLargeMutationFile (file_mutations )
82126 df_mutations ['pos_end' ] = df_mutations ['pos' ] + df_mutations ['ref' ].str .len () - 1
83127 df_mutations = df_mutations .sort_values (by = 'pos' )
84128 if chromosome :
0 commit comments