Skip to content

Commit 215fac4

Browse files
committed
Add --info_field and --info_field_thres options, like ProHap
1 parent 03170dc commit 215fac4

File tree

3 files changed

+234
-62
lines changed

3 files changed

+234
-62
lines changed

src/PrecisionProDB.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,8 @@ def main():
4444
parser.add_argument('--keep_all', help='If set, do not delete files generated during the run', action='store_true')
4545

4646
parser.add_argument('-S','--sqlite', help='''A path of sqlite file for re-use of annotation info. default '', do not use sqlite. The program will create a sqlite file if the file does not exist. If the file already exists, the program will use data stored in the file. It will cause error if the content in the sqlite file is not as expected. ''', default='', type=str)
47+
parser.add_argument('--info_field', help='fields to use in the INFO column of the vcf file to filter variants. Default None', default = None)
48+
parser.add_argument('--info_field_thres', help='threhold for the info field. Default None, do not filter any variants. If set "--info_filed AF --info_field_thres 0.01", only keep variants with AF >= 0.01', default = None)
4749

4850

4951
f = parser.parse_args()
@@ -165,7 +167,7 @@ def main():
165167
# use Sqlite
166168
import PrecisionProDB_Sqlite
167169
print('using sqlite database to speed up')
168-
PrecisionProDB_Sqlite.main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protein, threads, outprefix, datatype, protein_keyword, filter_PASS, individual, chromosome_only, keep_all, file_sqlite)
170+
PrecisionProDB_Sqlite.main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protein, threads, outprefix, datatype, protein_keyword, filter_PASS, individual, chromosome_only, keep_all, file_sqlite, info_field = f.info_field, info_field_thres = f.info_field_thres)
169171

170172
pattern = re.compile(r'(chr)?(\d+)-(\d+)-([A-Za-z]+)-([A-Za-z]+)')
171173
match = pattern.match(file_mutations)

src/PrecisionProDB_Sqlite.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -134,15 +134,15 @@ def runPerChomSqlite(file_sqlite, file_mutations, threads, outprefix, protein_ke
134134

135135
print('finished!')
136136

137-
def runPerChomSqlite_vcf(file_mutations, file_sqlite, threads, outprefix, datatype, protein_keyword, keep_all, individual, chromosome_only, filter_PASS, chromosomes_genome, chromosomes_genome_description, file_gtf):
137+
def runPerChomSqlite_vcf(file_mutations, file_sqlite, threads, outprefix, datatype, protein_keyword, keep_all, individual, chromosome_only, filter_PASS, chromosomes_genome, chromosomes_genome_description, file_gtf, info_field = None, info_field_thres=None):
138138
'''
139139
'''
140140
from vcf2mutation import convertVCF2MutationComplex
141141
from PrecisionProDB_vcf import readProtein2DF, openFile
142142
# get two mutation files from vcf file
143143
print('start extracting mutation file from the vcf input')
144144
outprefix_vcf = outprefix + '.vcf2mutation'
145-
individual = convertVCF2MutationComplex(file_vcf = file_mutations, outprefix = outprefix_vcf, individual=individual, filter_PASS = filter_PASS, chromosome_only = chromosome_only)
145+
individual = convertVCF2MutationComplex(file_vcf = file_mutations, outprefix = outprefix_vcf, individual_input=individual, filter_PASS = filter_PASS, chromosome_only = chromosome_only, info_field = info_field, info_field_thres=info_field_thres, threads = threads)
146146
individual = ','.join(individual)
147147
print('finished extracting mutations from the vcf file')
148148
file_mutations = outprefix + '.vcf2mutation.tsv'
@@ -221,7 +221,7 @@ def check_sqlite_file(file_path):
221221
if 'conn' in locals():
222222
conn.close()
223223

224-
def main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protein, threads, outprefix, datatype, protein_keyword, filter_PASS, individual, chromosome_only, keep_all, file_sqlite):
224+
def main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protein, threads, outprefix, datatype, protein_keyword, filter_PASS, individual, chromosome_only, keep_all, file_sqlite, info_field=None, info_field_thres=None):
225225
'''
226226
the main function of PrecisionProDB_Sqlite
227227
'''
@@ -303,7 +303,9 @@ def main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protei
303303
chromosome_only = chromosome_only,
304304
chromosomes_genome=chromosomes_genome,
305305
chromosomes_genome_description=chromosomes_genome_description,
306-
file_gtf=file_gtf
306+
file_gtf=file_gtf,
307+
info_field=info_field,
308+
info_field_thres=info_field_thres
307309
)
308310
else:
309311
print('variant file is a tsv file')
@@ -356,6 +358,9 @@ def main():
356358
parser.add_argument('-A','--all_chromosomes', help='default keep variant in chromosomes and ignore those in short fragments of the genome. if set, use all chromosomes including fragments when parsing the vcf file', action='store_true')
357359
parser.add_argument('--keep_all', help='If set, do not delete files generated during the run', action='store_true')
358360
parser.add_argument('-S','--sqlite', help='''A path of sqlite file for re-use of annotation info. default '', do not use sqlite. The program will create a sqlite file if the file does not exist. If the file already exists, the program will use data stored in the file. It will cause error if the content in the sqlite file is not as expected. ''', default='', type=str)
361+
parser.add_argument('--info_field', help='fields to use in the INFO column of the vcf file to filter variants. Default None', default = None)
362+
parser.add_argument('--info_field_thres', help='threhold for the info field. Default None, do not filter any variants. If set "--info_filed AF --info_field_thres 0.01", only keep variants with AF >= 0.01', default = None)
363+
359364

360365
if TEST:
361366
f = parser.parse_args(f"-g {file_genome} -f {file_gtf} -m {file_mutations} -p {file_protein} -t {threads} -o {outprefix} -a {datatype} -k {protein_keyword} -F --keep_all -S {file_sqlite}".split())
@@ -378,7 +383,7 @@ def main():
378383
print(description)
379384
print(f)
380385

381-
main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protein, threads, outprefix, datatype, protein_keyword, filter_PASS, individual, chromosome_only, keep_all, file_sqlite)
386+
main_PrecsionProDB_Sqlite(file_genome, file_gtf, file_mutations, file_protein, threads, outprefix, datatype, protein_keyword, filter_PASS, individual, chromosome_only, keep_all, file_sqlite, info_field=f.info_field, info_field_thres=f.info_field_thres)
382387

383388
if __name__ == '__main__':
384389
main()

0 commit comments

Comments
 (0)