You signed in with another tab or window. Reload to refresh your session. You signed out in another tab or window. Reload to refresh your session. You switched accounts on another tab or window. Reload to refresh your session. Dismiss alert
Copy file name to clipboardExpand all lines: src/PrecisionProDB.py
+3-3Lines changed: 3 additions & 3 deletions
Original file line number
Diff line number
Diff line change
@@ -5,7 +5,7 @@
5
5
import time
6
6
7
7
description='''
8
-
PerGeno, personal proteogenomic tools which outputs a new reference protein based on the variants data.
8
+
PrecisionProDB, personal proteogenomic tools which outputs a new reference protein based on the variants data.
9
9
VCF/tsv file as the variant input
10
10
If the variant file is in tsv format, at least four columns are required in the header: chr, pos, ref, alt. Try to convert the file to proper format if you have a bed or other file.
11
11
'''
@@ -18,7 +18,7 @@
18
18
parser.add_argument('-p','--protein', help='protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', default='')
19
19
parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
20
20
parser.add_argument('-o', '--out', help='''output prefix. Five files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences, two variant files from vcf. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa, {out}.vcf2mutation_1/2.tsv. default "perGeno" ''', default="perGeno")
21
-
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
21
+
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
22
22
parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
23
23
parser.add_argument('-F', '--no_filter', help='default only keep variant with value "PASS" FILTER column of vcf file. if set, do not filter', action='store_true')
24
24
parser.add_argument('-s', '--sample', help='sample name in the vcf to extract the variant information. default: None, extract the first sample', default=None)
description='''PerGeno, personal proteogenomic tools which outputs a new reference protein based on the variants data
587
+
description='''PrecisionProDB_core, personal proteogenomic tools which outputs a new reference protein based on the variants data
588
588
'''
589
589
590
590
if __name__=='__main__':
@@ -596,7 +596,7 @@ def runPerChom(self):
596
596
parser.add_argument('-p','--protein', help='protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', required=True)
597
597
parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
598
598
parser.add_argument('-o', '--out', help='''output prefix. Three files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa. default "perGeno" ''', default="perGeno")
599
-
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
599
+
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
600
600
parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
description='''PerGenoVCF, personal proteogenomic tools which outputs a new reference protein based on the variants data. VCF file as the variant input
132
+
description='''PrecisionProDB_vcf, personal proteogenomic tools which outputs a new reference protein based on the variants data. VCF file as the variant input
133
133
'''
134
134
if __name__=='__main__':
135
135
import argparse
@@ -140,7 +140,7 @@ def runPerGenoVCF(
140
140
parser.add_argument('-p','--protein', help='protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', required=True)
141
141
parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
142
142
parser.add_argument('-o', '--out', help='''output prefix. Five files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences, two variant files from vcf. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa, {out}.vcf2mutation_1/2.tsv. default "perGeno" ''', default="perGeno")
143
-
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
143
+
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
144
144
parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
145
145
parser.add_argument('-F', '--no_filter', help='default only keep variant with value "PASS" FILTER column of vcf file. if set, do not filter', action='store_true')
146
146
parser.add_argument('-s', '--sample', help='sample name in the vcf to extract the variant information. default: None, extract the first sample', default=None)
parser.add_argument('-r', '--files_ref', help='reference proteins to match with uniprot proteins. If more than one files, join by ","')
109
109
parser.add_argument('-a', '--files_alt', help='altered reference proteins. If more than one files, join by ",". The order should be the same as files_ref')
110
110
parser.add_argument('-o', '--outprefix', help='prefix for output files. default:"perGeno"', default='perGeno')
111
-
parser.add_argument('-m', '--length_min', help='output folder for the downloaded files. default: "20"', default=20, type=int)
111
+
parser.add_argument('-m', '--length_min', help='minimum length required when matching UniProt sequences with sequences in files_ref. default: "20"', default=20, type=int)
# sometimes the AA sequences for example ENSP00000466819, is longer than the CDSplus. skip.
425
-
if len(AA_seq)*3>len(CDSplus):
425
+
if len(AA_seq.strip('X'))*3>len(CDSplus):
426
426
print(transcript_id, 'input protein sequences cannot be translated from the CDS sequence in gtf annotation.')
427
427
428
428
if strand=='-':
@@ -695,7 +695,7 @@ def run_perChrom(self):
695
695
fout.close()
696
696
697
697
698
-
description='''output a new reference protein set by with the variants data
698
+
description='''output a new reference protein set with the variants data for each chromosome. The input files were generated by PrecisionProDB_core.
699
699
'''
700
700
701
701
if __name__=='__main__':
@@ -708,7 +708,7 @@ def run_perChrom(self):
708
708
parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
709
709
parser.add_argument('-o', '--out', help='output prefix. two file will be output. One is the annotation for mutated transcripts, one is the protein sequences. {out}.aa_mutations.csv, {out}.mutated_protein.fa')
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
711
+
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
0 commit comments