Skip to content

Commit f2a4917

Browse files
committed
use memmap for extra large file
1 parent 8cf0869 commit f2a4917

File tree

1 file changed

+69
-18
lines changed

1 file changed

+69
-18
lines changed

src/perChromSqlite.py

Lines changed: 69 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -84,25 +84,56 @@ def get_protein_id_from_df_mutations(df_mutations, file_sqlite, cpu_counts=10):
8484

8585
return combined_results
8686

87-
def convert_df_transcript2_to_df_transcript3_helper(protein_id, df_transcript2, df_mutations, individual, **kargs):
87+
def convert_df_transcript2_to_df_transcript3_helper(protein_id, df_transcript2, df_mutations, individual, kargs):
8888
'''
8989
protein_id is a protein_id in df_transcript2
9090
for each protein_id, get the mutations in each individual.
9191
return a list of tuple, each tuple is (tuple of variant_index, individuals with this variant_index joined by ',')
92+
93+
If shape and file_memmap are provided in kargs, it means we're dealing with an extra large mutation file
94+
where individual data is stored in a memory-mapped file instead of in the DataFrame.
9295
'''
9396
mutations = df_transcript2.loc[protein_id]['mutations']
94-
tdf_m = df_mutations.loc[mutations]
95-
tdc = {}
96-
for sample in individual:
97-
if sample not in tdf_m.columns:
98-
print('warning:', sample, 'not in mutation columns and is ignored! unexpected error may happen!')
99-
continue
100-
tdf_m_single = tdf_m[tdf_m[sample].apply(is_valid)]
101-
variant_index = tuple(tdf_m_single.index)
102-
if len(variant_index) > 0:
103-
if variant_index not in tdc:
104-
tdc[variant_index] = []
105-
tdc[variant_index].append(sample)
97+
98+
# Check if we're using memory-mapped file for individual data
99+
if 'shape' in kargs and 'file_memmap' in kargs:
100+
# Using memory-mapped file for individual data
101+
shape = kargs['shape']
102+
file_memmap = kargs['file_memmap']
103+
104+
# Open the memory-mapped file in read mode
105+
mmap = np.memmap(file_memmap, dtype='int8', mode='r', shape=shape)
106+
107+
# Get the indices of mutations for this protein
108+
mutation_indices = mutations
109+
110+
tdc = {}
111+
for i, sample in enumerate(individual):
112+
# For each sample, check which mutations are valid (value is 1)
113+
valid_mutations = []
114+
for idx in mutation_indices:
115+
if idx < mmap.shape[0] and i < mmap.shape[1] and mmap[idx, i] == 1:
116+
valid_mutations.append(idx)
117+
118+
variant_index = tuple(valid_mutations)
119+
if len(variant_index) > 0:
120+
if variant_index not in tdc:
121+
tdc[variant_index] = []
122+
tdc[variant_index].append(sample)
123+
else:
124+
# Using regular DataFrame for individual data
125+
tdf_m = df_mutations.loc[mutations]
126+
tdc = {}
127+
for sample in individual:
128+
if sample not in tdf_m.columns:
129+
print('warning:', sample, 'not in mutation columns and is ignored! unexpected error may happen!')
130+
continue
131+
tdf_m_single = tdf_m[tdf_m[sample].apply(is_valid)]
132+
variant_index = tuple(tdf_m_single.index)
133+
if len(variant_index) > 0:
134+
if variant_index not in tdc:
135+
tdc[variant_index] = []
136+
tdc[variant_index].append(sample)
106137

107138
tdc = {k:','.join(v) for k,v in tdc.items()}
108139
return list(tdc.items())
@@ -114,6 +145,8 @@ def convert_df_transcript2_to_df_transcript3(df_transcript2, df_mutations, indiv
114145
else, group mutations in df_transcript2 by individual. rename index by adding __X, and add column 'individual'
115146
individual is a list of samples in df_mutations
116147
148+
If shape and file_memmap are provided in kargs, it means we're dealing with an extra large mutation file
149+
where individual data is stored in a memory-mapped file instead of in the DataFrame.
117150
'''
118151
if individual is None or individual == '' or individual == 'None' or individual == [] or individual == 'ALL_VARIANTS':
119152
return df_transcript2
@@ -122,17 +155,20 @@ def convert_df_transcript2_to_df_transcript3(df_transcript2, df_mutations, indiv
122155
individual = individual
123156
elif individual == 'ALL_SAMPLES':
124157
individual = [i for i in df_mutations.columns if i not in ['chr', 'pos', '', 'ref', 'alt', 'pos_end']]
125-
print('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. individuals were set as:', individual)
158+
if len(individual) < 100:
159+
print('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. individuals were set as:', individual)
160+
else:
161+
print('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. {} individuals were used'.format(len(individual)))
126162
elif ',' in individual:
127163
individual = individual.split(',')
128164
else:
129165
individual = [individual]
130166

131167
if cpu_counts == 1:
132-
results = [convert_df_transcript2_to_df_transcript3_helper(protein_id, df_transcript2, df_mutations, individual) for protein_id in df_transcript2.index]
168+
results = [convert_df_transcript2_to_df_transcript3_helper(protein_id, df_transcript2, df_mutations, individual, kargs) for protein_id in df_transcript2.index]
133169
else:
134170
pool = Pool(cpu_counts)
135-
results = pool.starmap(convert_df_transcript2_to_df_transcript3_helper, [(protein_id, df_transcript2, df_mutations, individual) for protein_id in df_transcript2.index])
171+
results = pool.starmap(convert_df_transcript2_to_df_transcript3_helper, [(protein_id, df_transcript2, df_mutations, individual, kargs) for protein_id in df_transcript2.index])
136172
pool.close()
137173
pool.join()
138174

@@ -227,7 +263,11 @@ def __init__(
227263
individual = individual
228264
elif individual == 'ALL_SAMPLES':
229265
individual = [i for i in df_mutations.columns if i not in ['chr', 'pos', '', 'ref', 'alt', 'pos_end']]
230-
print('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. individuals were set as:', individual)
266+
if len(individual) < 100:
267+
print('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. individuals were set as:', individual)
268+
else:
269+
print('warning: individual is ALL_SAMPLES, all columns in file_mutations other than chr pos ref alt were used as individuals, which may cause problem. {} individuals were used'.format(len(individual)))
270+
231271
elif ',' in individual:
232272
individual = individual.split(',')
233273
elif individual in df_mutations.columns:
@@ -245,6 +285,15 @@ def __init__(
245285
tsv2memmap(file_mutations, individuals = self.individual, memmap_file=file_mutations + '.memmap')
246286
self.shape = shape
247287
self.file_memmap = file_mutations + '.memmap'
288+
289+
if chromosome:
290+
chromosome_with_genomicLocs = buildSqlite.get_genomicLocs_chromosomes(file_sqlite)
291+
if chromosome in chromosome_with_genomicLocs:
292+
self.df_mutations['chr'] = chromosome
293+
elif chromosome.startswith('chr') and chromosome[3:] in chromosome_with_genomicLocs:
294+
self.df_mutations['chr'] = chromosome[3:]
295+
else:
296+
print(f'warning! chromosome {chromosome} not in file_sqlite')
248297

249298

250299

@@ -291,7 +340,7 @@ def run_perChrom(self, save_results = True):
291340
df_transcript2.at[transcript_id, 'mutations'] = []
292341
df_transcript2 = df_transcript2[~df_transcript2.index.isin(set(tdf_special.index))]
293342

294-
if not self.extra_large_file_mutation
343+
if not self.extra_large_file_mutation:
295344
df_transcript3 = convert_df_transcript2_to_df_transcript3(df_transcript2, df_mutations, individual = individual, cpu_counts=cpu_counts)
296345
else:
297346
df_transcript3 = convert_df_transcript2_to_df_transcript3(df_transcript2, df_mutations, individual = individual, cpu_counts=cpu_counts, shape=self.shape, file_memmap = self.file_memmap)
@@ -341,6 +390,8 @@ def main():
341390
parser.add_argument('-s', '--sample', help='''
342391
sample name in the vcf to extract the variant information. default: None, use all variants and do not consider samples.
343392
For multiple samples, use "," to join the sample names. For example, "--sample sample1,sample2,sample3".
393+
To use all samples, use "--sample ALL_SAMPLES".
394+
To use all variants regardless where the variants from, use "--sample ALL_VARIANTS".
344395
''', default=None)
345396

346397
if TEST:

0 commit comments

Comments
 (0)