update some annotations

ATPs · ATPs · commit 9531062acfce · 2020-12-20T15:09:55.000-05:00
diff --git a/src/PrecisionProDB.py b/src/PrecisionProDB.py
@@ -5,7 +5,7 @@
 import time
 
 description = '''
-PerGeno, personal proteogenomic tools which outputs a new reference protein based on the variants data. 
+PrecisionProDB, a personal proteogenomic tool which outputs a new reference protein based on the variants data. 
 VCF/tsv file as the variant input
 If the variant file is in tsv format, at least four columns are required in the header: chr, pos, ref, alt. Try to convert the file to proper format if you have a bed or other file.
 '''
@@ -18,7 +18,7 @@
     parser.add_argument('-p','--protein', help = 'protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', default='')
     parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
     parser.add_argument('-o', '--out', help='''output prefix. Five files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences, two variant files from vcf. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa, {out}.vcf2mutation_1/2.tsv. default "perGeno" ''', default="perGeno")
-    parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
+    parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
     parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
     parser.add_argument('-F', '--no_filter', help='default only keep variant with value "PASS" FILTER column of vcf file. if set, do not filter', action='store_true')
     parser.add_argument('-s', '--sample', help='sample name in the vcf to extract the variant information. default: None, extract the first sample', default=None)
@@ -102,4 +102,4 @@
         extractMutatedUniprot.extractMutatedUniprot(files_uniprot=files_uniprot, files_ref=file_protein, files_alt=outprefix + '.pergeno.protein_all.fa', outprefix=outprefix, length_min = 20)
 
     
-    print('perGeno finished!, total seconds:', time.time() - time0)
+    print('PrecisionProDB finished!, total seconds:', time.time() - time0)
diff --git a/src/PrecisionProDB_core.py b/src/PrecisionProDB_core.py
@@ -308,6 +308,8 @@ def splitMutationByChromosome(self):
         for k,v in df_mutations.groupby('chr'):
             v = v.copy()
             chromosomes_mutation.append(k)
+            if k.startswith('chr'):
+                k = k[3:]
             if k in chromosomes_genome:
                 tf = os.path.join(tempfolder, k + '.mutation.tsv')
             elif 'chr' + k in chromosomes_genome:
@@ -316,8 +318,6 @@ def splitMutationByChromosome(self):
                 v['chr'] = v['chr'].apply(lambda x:'chr' + str(x))
             else:
                 print('chromosomes in mutation file is different from the genome. try to solve that. This is usually True if datatype is RefSeq')
-                if k.startswith('chr'):
-                    k = k[3:]
                 if k == 'M':
                     for e in chromosomes_genome_description:
                         e1, e2 = e.split(' ', maxsplit=1)
@@ -333,7 +333,7 @@ def splitMutationByChromosome(self):
                             print(f'    mutation chromosome change {k} to {k_new}')
                             break
                     else:
-                        print('chromosome in mutation file', k, 'cannot find corresponding chromosome in genome file. please check')
+                        print('chromosomes in mutation file', k, 'cannot find corresponding chromosome in genome file. please check')
                         k_new = k
                 tf = os.path.join(tempfolder, k_new + '.mutation.tsv')
 
@@ -580,11 +580,11 @@ def runPerChom(self):
         # clear temp folder
         shutil.rmtree(self.tempfolder)
 
-        print('perGeno finished!')
+        print('finished!')
 
 
 
-description = '''PerGeno, personal proteogenomic tools which outputs a new reference protein based on the variants data
+description = '''PrecisionProDB_core, a personal proteogenomic tool which outputs a new reference protein based on the variants data
 '''
 
 if __name__ == '__main__':
@@ -596,7 +596,7 @@ def runPerChom(self):
     parser.add_argument('-p','--protein', help = 'protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', required=True)
     parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
     parser.add_argument('-o', '--out', help='''output prefix. Three files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa. default "perGeno" ''', default="perGeno")
-    parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
+    parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
     parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
     f = parser.parse_args()
     
diff --git a/src/PrecisionProDB_vcf.py b/src/PrecisionProDB_vcf.py
@@ -49,14 +49,14 @@ def runPerGenoVCF(
     file_mutations_2 = outprefix_vcf + '_2.tsv'
 
     # run perGeno for mutations_1
-    print('start running perGeno for first strand of the genome mutation file')
+    print('start running PrecisionProDB for first strand of the genome mutation file')
     outprefix_1 = outprefix + '_1'
     pergeno_1 = PerGeno(file_genome = file_genome, file_gtf=file_gtf, file_mutations = file_mutations_1, file_protein=file_protein, threads=threads, outprefix=outprefix_1, datatype=datatype, protein_keyword=protein_keyword)
     pergeno_1.splitInputByChromosomes()
     pergeno_1.runPerChom()
 
     # run perGeno for mutations_2
-    print('start running perGeno for second strand of the genome mutation file')
+    print('start running PrecisionProDB for second strand of the genome mutation file')
     outprefix_2 = outprefix + '_2'
     pergeno_2 = PerGeno(file_genome = file_genome, file_gtf=file_gtf, file_mutations = file_mutations_2, file_protein=file_protein, threads=threads, outprefix=outprefix_2, datatype=datatype, protein_keyword=protein_keyword)
     pergeno_2.splitInputByChromosomes()
@@ -129,7 +129,7 @@ def runPerGenoVCF(
     print('perGeno_vcf finished!')
 
 
-description = '''PerGenoVCF, personal proteogenomic tools which outputs a new reference protein based on the variants data. VCF file as the variant input
+description = '''PrecisionProDB_vcf, a personal proteogenomic tool which outputs a new reference protein based on the variants data. VCF file as the variant input
 '''
 if __name__ == '__main__':
     import argparse
@@ -140,7 +140,7 @@ def runPerGenoVCF(
     parser.add_argument('-p','--protein', help = 'protein sequences in fasta format. It can be a gzip file. Only proteins in this file will be checked', required=True)
     parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
     parser.add_argument('-o', '--out', help='''output prefix. Five files will be saved, including the annotation for mutated transcripts, the mutated or all protein sequences, two variant files from vcf. {out}.pergeno.aa_mutations.csv, {out}.pergeno.protein_all.fa, {out}.protein_changed.fa, {out}.vcf2mutation_1/2.tsv. default "perGeno" ''', default="perGeno")
-    parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
+    parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
     parser.add_argument('-k','--protein_keyword', help='''field name in attribute column of gtf file to determine ids for proteins. default "auto", determine the protein_keyword based on datatype. "transcript_id" for GENCODE_GTF, "protein_id" for "RefSeq" and "Parent" for gtf and GENCODE_GFF3 ''', default='auto')
     parser.add_argument('-F', '--no_filter', help='default only keep variant with value "PASS" FILTER column of vcf file. if set, do not filter', action='store_true')
     parser.add_argument('-s', '--sample', help='sample name in the vcf to extract the variant information. default: None, extract the first sample', default=None)
diff --git a/src/downloadHuman.py b/src/downloadHuman.py
@@ -144,8 +144,8 @@ def download(datatype, workfolder='.'):
 
 
 description = '''
-download the latest human gene models from RefSeq, GENCODE or Ensembl to run perGeno
-If datatype is "Uniprot", Ensembl and Uniprot human sequences (UP000005640_9606, UP000005640_9606_additional) will be downloaded
+download the latest human gene models from RefSeq, GENCODE, Ensembl or UniProt to run PrecisionProDB.
+If datatype is "Uniprot", Ensembl and UniProt human sequences (UP000005640_9606, UP000005640_9606_additional) will be downloaded
 '''
 
 if __name__ == '__main__':
diff --git a/src/extractMutatedUniprot.py b/src/extractMutatedUniprot.py
@@ -96,8 +96,8 @@ def extractMutatedUniprot(files_uniprot, files_ref, files_alt, outprefix, length
 
 
 description = '''
-match uniprot proteins in files_uniprot with files_ref. To start a match, the min length of protein is length_min.
-output mutated uniprot proteins in files_alt.
+Match UniProt proteins in files_uniprot with files_ref.
+Output mutated proteins in files_alt.
 write three files, outprefix + '.uniprot_changed.tsv'/'.uniprot_changed.fa'/'.uniprot_all.fa'
 '''
 
@@ -108,7 +108,7 @@ def extractMutatedUniprot(files_uniprot, files_ref, files_alt, outprefix, length
     parser.add_argument('-r', '--files_ref', help='reference proteins to match with uniprot proteins. If more than one files, join by ","')
     parser.add_argument('-a', '--files_alt', help='altered reference proteins. If more than one files, join by ",". The order should be the same as files_ref')
     parser.add_argument('-o', '--outprefix', help='prefix for output files. default:"perGeno"', default='perGeno')
-    parser.add_argument('-m', '--length_min', help='output folder for the downloaded files. default: "20"', default=20, type=int)
+    parser.add_argument('-m', '--length_min', help='minimum length required when matching UniProt sequences with sequences in files_ref. default: "20"', default=20, type=int)
     f = parser.parse_args()
     
     files_uniprot = f.files_uniprot
diff --git a/src/perChrom.py b/src/perChrom.py
@@ -422,7 +422,7 @@ def translateCDSplusWithMut(self, transcript_id):
         tdf_m = self.getMut(mutations, strand)
         
         # sometimes the AA sequences for example ENSP00000466819, is longer than the CDSplus. skip.
-        if len(AA_seq)*3 > len(CDSplus):
+        if len(AA_seq.strip('X'))*3 > len(CDSplus):
             print(transcript_id, 'input protein sequences cannot be translated from the CDS sequence in gtf annotation.')
 
         if strand == '-':
@@ -695,7 +695,7 @@ def run_perChrom(self):
         fout.close()
 
 
-description = '''output a new reference protein set by with the variants data
+description = '''output a new reference protein set with the variants data for each chromosome. The input files were generated by PrecisionProDB_core.
 '''
 
 if __name__ == '__main__':
@@ -708,7 +708,7 @@ def run_perChrom(self):
     parser.add_argument('-t', '--threads', help='number of threads/CPUs to run the program. default, use all CPUs available', type=int, default=os.cpu_count())
     parser.add_argument('-o', '--out', help='output prefix. two file will be output. One is the annotation for mutated transcripts, one is the protein sequences. {out}.aa_mutations.csv, {out}.mutated_protein.fa')
     parser.add_argument('-c', '--chromosome', help='''chromosome name/id, default="chr1" ''', default='chr1', type=str)
-    parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". perGeno does not support Ensembl GFF3 ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
+    parser.add_argument('-a', '--datatype', help='''input datatype, could be GENCODE_GTF, GENCODE_GFF3, RefSeq, Ensembl_GTF or gtf. default "gtf". Ensembl_GFF3 is not supported. ''', default='gtf', type=str, choices=['GENCODE_GTF', 'GENCODE_GFF3','RefSeq','Ensembl_GTF','gtf'])
     
     f = parser.parse_args()