Skip to content

Commit b90b559

Browse files
committed
perChrom, reduce memory and increase speed for very large mutation file
1 parent a3e39dd commit b90b559

File tree

1 file changed

+45
-1
lines changed

1 file changed

+45
-1
lines changed

src/perChrom.py

Lines changed: 45 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,46 @@ def parse_proteins(file_proteins):
3838
df_protein['protein_anno'] = df_protein.apply(lambda x:x['protein_description'].split(maxsplit=1)[-1] if x['protein_description'].split(maxsplit=1)[-1] != x['protein_id_fasta'] else '', axis=1)
3939
return df_protein
4040

41+
def readExtraLargeMutationFile(file_mutations):
    """Read a very large tab-separated mutation file with compact dtypes.

    The first 1000 rows are sampled to find columns that contain only the
    integers 0 and 1; those columns are then loaded as ``int8`` instead of
    the default ``int64`` when the whole file is read.

    Args:
        file_mutations (str): Path to the tab-separated mutation file. The
            columns 'chr', 'pos', 'ref' and 'alt' are never down-cast.

    Returns:
        pd.DataFrame: The full table. Columns detected as binary are
        ``int8``; all other columns keep the dtypes pandas infers. If the
        compact dtypes turn out not to fit the full file, the table is
        re-read without them.
    """
    # Step 1: read a sample to detect binary columns.
    sample = pd.read_csv(file_mutations, nrows=1000, sep='\t')

    # Step 2: build the dtype dictionary for binary columns.
    dtype_mapping = {}
    for col in sample.columns:
        if col in ('chr', 'pos', 'ref', 'alt'):
            continue
        values = sample[col]
        # Only a non-empty, NA-free integer column can be loaded as int8.
        # An all-empty column, a column with missing values, or a boolean
        # column would otherwise pass a plain "subset of {0, 1}" test and
        # then fail the integer cast during the full read.
        if values.empty or values.isna().any():
            continue
        if not pd.api.types.is_integer_dtype(values):
            continue
        if values.isin([0, 1]).all():
            # int8 is the smallest integer dtype; 0/1 fit comfortably.
            dtype_mapping[col] = 'int8'

    # Step 3: read the full dataset, keeping default dtypes for the
    # non-binary columns.
    read_options = dict(
        engine='c',           # C engine for faster parsing
        true_values=['1'],    # handle potential string representations
        false_values=['0'],
        keep_default_na=False,
        sep='\t',
        low_memory=True,
    )
    try:
        # NOTE(review): integers outside the int8 range appearing after the
        # sampled rows may be wrapped by the cast rather than rejected --
        # confirm against the pandas version in use.
        return pd.read_csv(file_mutations, dtype=dtype_mapping, **read_options)
    except (ValueError, OverflowError):
        # A column that looked binary in the sample holds something else
        # further down the file; fall back to inferred dtypes rather than
        # failing the whole load.
        return pd.read_csv(file_mutations, **read_options)
4181

4282

4383
def parse_mutation(file_mutations, chromosome=None):
@@ -78,7 +118,11 @@ def parse_mutation(file_mutations, chromosome=None):
78118
if chromosome:
79119
df_mutations['chr'] = chromosome
80120
else:
81-
df_mutations = pd.read_csv(file_mutations, sep='\t', low_memory=False)
121+
if os.path.getsize(file_mutations) < 1000000000:
122+
df_mutations = pd.read_csv(file_mutations, sep='\t', low_memory=False)
123+
else:
124+
print(file_mutations, 'very large file, use readExtraLargeMutationFile')
125+
df_mutations = readExtraLargeMutationFile(file_mutations)
82126
df_mutations['pos_end'] = df_mutations['pos'] + df_mutations['ref'].str.len() - 1
83127
df_mutations = df_mutations.sort_values(by='pos')
84128
if chromosome:

0 commit comments

Comments
 (0)