|
13 | 13 | from PrecisionProDB_core import PerGeno, get_k_new |
14 | 14 | import re |
15 | 15 | import sqlite3 |
| 16 | +from multiprocessing import Pool |
16 | 17 |
|
17 | 18 | # code below for testing the program |
18 | 19 | TEST = False |
@@ -86,14 +87,29 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke |
86 | 87 | pergeno.chromosomes_genome = chromosomes_genome |
87 | 88 | chromosomes_mutation = pergeno.splitMutationByChromosomeLarge(chromosomes_genome_description=chromosomes_genome_description, chromosomes_genome=chromosomes_genome) |
88 | 89 |
|
| 90 | + # run tsv2memmap here with multiple threads to save time |
| 91 | + files_mutation_to_convert = [f'{tempfolder}/{i}.mutation.tsv' for i in chromosomes_mutation] |
| 92 | + files_mutation_to_convert = [i for i in files_mutation_to_convert if os.path.getsize(i) > 100000000] |
| 93 | + if len(files_mutation_to_convert) > 0 and threads > 1 and individual !='': |
| 94 | + if individual == 'ALL_SAMPLES': |
| 95 | + columns_in_file_mutation = openFile(files_mutation_to_convert[0], 'r').readline().strip().split('\t') |
| 96 | + individual_for_memmap = [i for i in columns_in_file_mutation if i not in ['chr', 'pos', '', 'ref', 'alt', 'pos_end']] |
| 97 | + else: |
| 98 | + individual_for_memmap = individual |
| 99 | + from vcf2mutation import tsv2memmap |
| 100 | + pool = Pool(threads) |
| 101 | + pool.starmap(tsv2memmap, [(i, individual_for_memmap, i +'.memmap') for i in files_mutation_to_convert], chunksize=1) |
| 102 | + pool.close() |
| 103 | + pool.join() |
| 104 | + |
89 | 105 | # run runSinglePerChromSqlite |
90 | 106 | chromosomes_mutated = [runSinglePerChromSqlite(file_sqlite, f'{tempfolder}/{chromosome}.mutation.tsv', tempfolder, threads, chromosome, datatype, individual) for chromosome in chromosomes_mutation] |
91 | 107 | # successful chromosomes |
92 | 108 | chromosomes_mutated = [e for e in chromosomes_mutated if e is not None] |
93 | 109 | # collect mutation annotations |
94 | 110 | files_mutAnno = ['{}/{}.aa_mutations.csv'.format(tempfolder, chromosome) for chromosome in chromosomes_mutated] |
95 | 111 | file_mutAnno = outprefix + '.pergeno.aa_mutations.csv' |
96 | | - df_mutAnno = pd.concat([pd.read_csv(f, sep='\t') for f in files_mutAnno if os.path.exists(f)], ignore_index=True) |
| 112 | + df_mutAnno = pd.concat([pd.read_csv(f, sep='\t', low_memory=False) for f in files_mutAnno if os.path.exists(f)], ignore_index=True) |
97 | 113 | print('total number of proteins with AA mutation:', df_mutAnno.shape[0]) |
98 | 114 | df_mutAnno.to_csv(file_mutAnno, sep='\t', index=None) |
99 | 115 |
|
|
0 commit comments