Skip to content

Commit 9531062

Browse files
committed
update some annotations
1 parent 63d0b63 commit 9531062

File tree

6 files changed

+21
-21
lines changed

6 files changed

+21
-21
lines changed

src/PrecisionProDB.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -5,7 +5,7 @@
55
import time
66

77
description = '''
8-
PerGeno, personal proteogenomic tools which outputs a new reference protein based on the variants data.
8+
PrecisionProDB, personal proteogenomic tools which outputs a new reference protein based on the variants data.
99
VCF/tsv file as the variant input
1010
If the variant file is in tsv format, at least four columns are required in the header: chr, pos, ref, alt. Try to convert the file to proper format if you have a bed or other file.
1111
'''
@@ -18,7 +18,7 @@
1818
parser.add_argument('-p','--protein', help = 'protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', default='')
1919
parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
2020
parser.add_argument('-o', '--out', help='''output prefix. Five files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences, two variant files from vcf. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa, {out}.vcf2mutation_1/2.tsv. default "perGeno" ''', default="perGeno")
21-
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
21+
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
2222
parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
2323
parser.add_argument('-F', '--no_filter', help='default only keep variant with value "PASS" FILTER column of vcf file. if set, do not filter', action='store_true')
2424
parser.add_argument('-s', '--sample', help='sample name in the vcf to extract the variant information. default: None, extract the first sample', default=None)
@@ -102,4 +102,4 @@
102102
extractMutatedUniprot.extractMutatedUniprot(files_uniprot=files_uniprot, files_ref=file_protein, files_alt=outprefix + '.pergeno.protein_all.fa', outprefix=outprefix, length_min = 20)
103103

104104

105-
print('perGeno finished!, total seconds:', time.time() - time0)
105+
print('PrecisionProDB finished!, total seconds:', time.time() - time0)

src/PrecisionProDB_core.py

Lines changed: 6 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -308,6 +308,8 @@ def splitMutationByChromosome(self):
308308
for k,v in df_mutations.groupby('chr'):
309309
v = v.copy()
310310
chromosomes_mutation.append(k)
311+
if k.startswith('chr'):
312+
k = k[3:]
311313
if k in chromosomes_genome:
312314
tf = os.path.join(tempfolder, k + '.mutation.tsv')
313315
elif 'chr' + k in chromosomes_genome:
@@ -316,8 +318,6 @@ def splitMutationByChromosome(self):
316318
v['chr'] = v['chr'].apply(lambda x:'chr' + str(x))
317319
else:
318320
print('chromosomes in mutation file is different from the genome. try to solve that. This is usually True if datatype is RefSeq')
319-
if k.startswith('chr'):
320-
k = k[3:]
321321
if k == 'M':
322322
for e in chromosomes_genome_description:
323323
e1, e2 = e.split(' ', maxsplit=1)
@@ -333,7 +333,7 @@ def splitMutationByChromosome(self):
333333
print(f' mutation chromosome change {k} to {k_new}')
334334
break
335335
else:
336-
print('chromosome in mutation file', k, 'cannot find corresponding chromosome in genome file. please check')
336+
print('chromosomes in mutation file', k, 'cannot find corresponding chromosome in genome file. please check')
337337
k_new = k
338338
tf = os.path.join(tempfolder, k_new + '.mutation.tsv')
339339

@@ -580,11 +580,11 @@ def runPerChom(self):
580580
# clear temp folder
581581
shutil.rmtree(self.tempfolder)
582582

583-
print('perGeno finished!')
583+
print('finished!')
584584

585585

586586

587-
description = '''PerGeno, personal proteogenomic tools which outputs a new reference protein based on the variants data
587+
description = '''PrecisionProDB_core, personal proteogenomic tools which outputs a new reference protein based on the variants data
588588
'''
589589

590590
if __name__ == '__main__':
@@ -596,7 +596,7 @@ def runPerChom(self):
596596
parser.add_argument('-p','--protein', help = 'protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', required=True)
597597
parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
598598
parser.add_argument('-o', '--out', help='''output prefix. Three files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa. default "perGeno" ''', default="perGeno")
599-
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
599+
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
600600
parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
601601
f = parser.parse_args()
602602

src/PrecisionProDB_vcf.py

Lines changed: 4 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -49,14 +49,14 @@ def runPerGenoVCF(
4949
file_mutations_2 = outprefix_vcf + '_2.tsv'
5050

5151
# run perGeno for mutations_1
52-
print('start running perGeno for first strand of the genome mutation file')
52+
print('start running PrecisionProDB for first strand of the genome mutation file')
5353
outprefix_1 = outprefix + '_1'
5454
pergeno_1 = PerGeno(file_genome = file_genome, file_gtf=file_gtf, file_mutations = file_mutations_1, file_protein=file_protein, threads=threads, outprefix=outprefix_1, datatype=datatype, protein_keyword=protein_keyword)
5555
pergeno_1.splitInputByChromosomes()
5656
pergeno_1.runPerChom()
5757

5858
# run perGeno for mutations_2
59-
print('start running perGeno for second strand of the genome mutation file')
59+
print('start running PrecisionProDB for second strand of the genome mutation file')
6060
outprefix_2 = outprefix + '_2'
6161
pergeno_2 = PerGeno(file_genome = file_genome, file_gtf=file_gtf, file_mutations = file_mutations_2, file_protein=file_protein, threads=threads, outprefix=outprefix_2, datatype=datatype, protein_keyword=protein_keyword)
6262
pergeno_2.splitInputByChromosomes()
@@ -129,7 +129,7 @@ def runPerGenoVCF(
129129
print('perGeno_vcf finished!')
130130

131131

132-
description = '''PerGenoVCF, personal proteogenomic tools which outputs a new reference protein based on the variants data. VCF file as the variant input
132+
description = '''PrecisionProDB_vcf, personal proteogenomic tools which outputs a new reference protein based on the variants data. VCF file as the variant input
133133
'''
134134
if __name__ == '__main__':
135135
import argparse
@@ -140,7 +140,7 @@ def runPerGenoVCF(
140140
parser.add_argument('-p','--protein', help = 'protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', required=True)
141141
parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
142142
parser.add_argument('-o', '--out', help='''output prefix. Five files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences, two variant files from vcf. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa, {out}.vcf2mutation_1/2.tsv. default "perGeno" ''', default="perGeno")
143-
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
143+
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
144144
parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
145145
parser.add_argument('-F', '--no_filter', help='default only keep variant with value "PASS" FILTER column of vcf file. if set, do not filter', action='store_true')
146146
parser.add_argument('-s', '--sample', help='sample name in the vcf to extract the variant information. default: None, extract the first sample', default=None)

src/downloadHuman.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -144,8 +144,8 @@ def download(datatype, workfolder='.'):
144144

145145

146146
description = '''
147-
download the latest human gene models from RefSeq, GENCODE or Ensembl to run perGeno
148-
If datatype is "Uniprot", Ensembl and Uniprot human sequences (UP000005640_9606, UP000005640_9606_additional) will be downloaded
147+
download the latest human gene models from RefSeq, GENCODE, Ensembl or UniProt to run PrecisionProDB.
148+
If datatype is "Uniprot", Ensembl and UniProt human sequences (UP000005640_9606, UP000005640_9606_additional) will be downloaded
149149
'''
150150

151151
if __name__ == '__main__':

src/extractMutatedUniprot.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -96,8 +96,8 @@ def extractMutatedUniprot(files_uniprot, files_ref, files_alt, outprefix, length
9696

9797

9898
description = '''
99-
match uniprot proteins in files_uniprot with files_ref. To start a match, the min length of protein is length_min.
100-
output mutated uniprot proteins in files_alt.
99+
Match UniProt proteins in files_uniprot with files_ref.
100+
Output mutated proteins in files_alt.
101101
write three files, outprefix + '.uniprot_changed.tsv'/'.uniprot_changed.fa'/'.uniprot_all.fa'
102102
'''
103103

@@ -108,7 +108,7 @@ def extractMutatedUniprot(files_uniprot, files_ref, files_alt, outprefix, length
108108
parser.add_argument('-r', '--files_ref', help='reference proteins to match with uniprot proteins. If more than one files, join by ","')
109109
parser.add_argument('-a', '--files_alt', help='altered reference proteins. If more than one files, join by ",". The order should be the same as files_ref')
110110
parser.add_argument('-o', '--outprefix', help='prefix for output files. default:"perGeno"', default='perGeno')
111-
parser.add_argument('-m', '--length_min', help='output folder for the downloaded files. default: "20"', default=20, type=int)
111+
parser.add_argument('-m', '--length_min', help='minimum length required when matching UniProt sequences with sequences in files_ref. default: "20"', default=20, type=int)
112112
f = parser.parse_args()
113113

114114
files_uniprot = f.files_uniprot

src/perChrom.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -422,7 +422,7 @@ def translateCDSplusWithMut(self, transcript_id):
422422
tdf_m = self.getMut(mutations, strand)
423423

424424
# sometimes the AA sequences for example ENSP00000466819, is longer than the CDSplus. skip.
425-
if len(AA_seq)*3 > len(CDSplus):
425+
if len(AA_seq.strip('X'))*3 > len(CDSplus):
426426
print(transcript_id, 'input protein sequences cannot be translated from the CDS sequence in gtf annotation.')
427427

428428
if strand == '-':
@@ -695,7 +695,7 @@ def run_perChrom(self):
695695
fout.close()
696696

697697

698-
description = '''output a new reference protein set by with the variants data
698+
description = '''output a new reference protein set with the variants data for each chromosome. The input files were generated by PrecisionProDB_core.
699699
'''
700700

701701
if __name__ == '__main__':
@@ -708,7 +708,7 @@ def run_perChrom(self):
708708
parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
709709
parser.add_argument('-o', '--out', help='output prefix. two file will be output. One is the annotation for mutated transcripts, one is the protein sequences. {out}.aa_mutations.csv, {out}.mutated_protein.fa')
710710
parser.add_argument('-c', '--chromosome', help='''chromosome name/id, default="chr1" ''', default='chr1', type=str)
711-
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
711+
parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
712712

713713
f = parser.parse_args()
714714

0 commit comments

Comments
 (0)