Merge pull request #6 from griffithlab/molecular_weight

evelyn-schmidt · web-flow · commit 0ddf47607609 · 2023-10-23T11:57:26.000-05:00
Use ProteinAnalysis from Biopython to calculate MW
diff --git a/Dockerfile b/Dockerfile
@@ -2,6 +2,9 @@
 
 FROM python:3.8-slim-buster
 
+RUN ["apt-get", "update"]
+RUN ["apt-get", "install", "-y", "vim"]
+
 ADD scripts/get_FDA_thresholds.py /opt/scripts/get_FDA_thresholds.py
 ADD scripts/get_neoantigen_qc.py /opt/scripts/get_neoantigen_qc.py
 ADD scripts/requirements.txt /opt/scripts/requirements.txt
diff --git a/scripts/color_peptides51mer.py b/scripts/color_peptides51mer.py
@@ -17,6 +17,15 @@ def __init__(self, nucleotide, bold, color, underline, large, position, open_tag
         self.open_tag = open_tag
         self.close_tag = close_tag
 
+    def view(self):
+        print("Nucleotide: ", self.nucleotide)
+        print("Open Tag: ", self.open_tag)
+        print("Close Tag: ", self.close_tag)
+        print("Bold: ", self.bold)
+        print("Color: ",self.color)
+        print("Underline: ", self.underline)
+        print("Large: ", self.large)
+
 # ---- PARSE ARGUMENTS -------------------------------------------------------
 # Parses command line arguments
 # Enables user help
@@ -178,7 +187,7 @@ def set_span_tags(peptide_sequence):
             nucleotide.open_tag = True
 
             if inside_span:
-                nucleotide.close_tag = True # only if its isnide a span tag
+                nucleotide.close_tag = True # only if its inside a span tag
             else:
                 nucleotide.close_tag = False
 
@@ -202,9 +211,7 @@ def create_stylized_sequence(peptide_sequence):
             if nucleotide.close_tag:
                 new_string += '</span>'
                 
-
             if nucleotide.open_tag:
-
                 new_string += '<span style="'
                 if nucleotide.bold:
                     new_string += 'font-weight:bold;'
@@ -216,9 +223,6 @@ def create_stylized_sequence(peptide_sequence):
                      new_string += 'font-size:105%;'
                 new_string += '">'
                 new_string += nucleotide.nucleotide
-
-            if not nucleotide.large and not nucleotide.bold and not nucleotide.color and not nucleotide.underline:
-                new_string += nucleotide.nucleotide
         else:
             new_string += nucleotide.nucleotide
     return(new_string)   
@@ -273,7 +277,6 @@ def main():
     for index, row in peptides_51mer.iterrows():
 
         search_string = row['51mer ID']
-        print(search_string)
 
         #classII_sequence 
         classII_peptide = merged_peptide_51mer.loc[merged_peptide_51mer['51mer ID'] == search_string, 'Best Peptide Class II'].values[0]
@@ -306,10 +309,14 @@ def main():
 
             set_span_tags(peptide_sequence) # pass by reference
             
+            print(row['51mer ID'])
             new_string = create_stylized_sequence(peptide_sequence)
 
             next_td_tags[2].string = new_string
 
+            # Remove the tag_with_search_string from the BeautifulSoup tree
+            tag_with_search_string.decompose()
+
             modified_html = peptides_51mer_soup.prettify(formatter=None)
 
         else:
@@ -318,6 +325,13 @@ def main():
             print("ClassI: ", classI_peptide)
             print("ClassII: ", classII_peptide, "\n")
 
+        soup = BeautifulSoup(modified_html, 'html.parser')
+        tag_with_search_string = soup.select_one('th:-soup-contains("51mer ID")')
+        if tag_with_search_string:
+            tag_with_search_string.decompose()
+        # Now 'soup' contains the modified HTML with the tag removed
+        modified_html = soup.prettify(formatter=None)
+
     if args.WB:
         html_file_name = args.WB +  '/../manual_review/' + args.samp + ".Colored_Peptides.html" 
     else:
diff --git a/scripts/generate_reviews_files.py b/scripts/generate_reviews_files.py
@@ -2,6 +2,8 @@
 import csv
 import pandas as pd
 import sys
+from Bio.SeqUtils.ProtParam import ProteinAnalysis
+import re
 
 '''
 Write a script to create the files for the Case Final Reports
@@ -36,6 +38,29 @@ def parse_arguments():
 
     return(parser.parse_args())
 
+# Function to split the peptide ID on '.' to extract gene and AA information
+def extract_info(value):
+    parts = value.split('.')
+    result = '.'.join([parts[2], parts[3], parts[4]])
+    return result
+
+# Function to rearrange string so that G518D looks like 518G/D
+def rearrange_string(s):
+    match = re.match(r'([A-Za-z]+)([\d-]+)([A-Za-z]*)', s)
+    if match:
+        letters_before = match.group(1)
+        numbers = match.group(2)
+        letters_after = match.group(3)
+                
+        return f"{numbers}{letters_before}/{letters_after}"
+    else:
+        return s
+    
+# Function to calculate molecular weight
+def calculate_molecular_weight(peptide):
+    analyzed_seq = ProteinAnalysis(peptide)
+    return analyzed_seq.molecular_weight()
+
 def main():
 
     # 1. ITB reivew
@@ -54,20 +79,41 @@ def main():
     reviewed_canidates = reviewed_canidates[reviewed_canidates.Evaluation != "Pending"]
     reviewed_canidates = reviewed_canidates[reviewed_canidates.Evaluation != "Reject"]
 
+    reviewed_canidates = reviewed_canidates.rename(columns={'Comments':'pVAC Review Comments'})
+    reviewed_canidates["Variant Called by CLE Pipeline"] = " "
+    reviewed_canidates["IGV Review Comments"] = " "
+
+
+    # create a sorting ID (gene + '.' + best transcript) so the reviewed candidates can be sorted in the same order as the peptides
+    reviewed_canidates['sorting id'] = reviewed_canidates['Gene']  + '.' + reviewed_canidates['Best Transcript']
+
+
     peptides = pd.read_csv(args.c, sep="\t")
     peptides =  peptides.drop(['cterm_7mer_gravy_score', 'cysteine_count', 'n_terminal_asparagine', 'asparagine_proline_bond_count', 
                                  'difficult_n_terminal_residue', 'c_terminal_cysteine', 'c_terminal_proline', 'max_7mer_gravy_score'], axis=1)
-    peptides = peptides.rename(columns={"id":"ID", "peptide_sequence":"CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE WITH FLANKING RESIDUES"})
     peptides["RESTRICTING HLA ALLELE"] = " "
-    peptides["CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE MW (CLIENT)"] = " "
+
+    peptides["CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE MW (CLIENT)"] = peptides["peptide_sequence"].apply(calculate_molecular_weight)
+
+    peptides = peptides.rename(columns={"id":"ID", "peptide_sequence":"CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE WITH FLANKING RESIDUES"})
     peptides["Comments"] = " "
     peptides["CANDIDATE NEOANTIGEN"] = peptides["ID"].apply(lambda x: '.'.join(x.split('.')[:3]))
     peptides["CANDIDATE NEOANTIGEN"] = args.samp + "." + peptides["CANDIDATE NEOANTIGEN"]
 
-
-
     peptides = peptides[["ID", "CANDIDATE NEOANTIGEN", "CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE WITH FLANKING RESIDUES", 
                            "RESTRICTING HLA ALLELE", "CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE MW (CLIENT)", "Comments"]]
+    
+
+    # create an ID used to sort the reviewed candidates in the order of the 51-mers
+    peptides['sorting id'] = peptides['ID'].apply(extract_info)
+
+    reviewed_canidates = reviewed_canidates.set_index('sorting id')
+    reviewed_canidates = reviewed_canidates.reindex(index=peptides['sorting id'])
+    reviewed_canidates = reviewed_canidates.reset_index()
+
+    reviewed_canidates = reviewed_canidates.drop(columns=['sorting id'])
+    peptides = peptides.drop(columns=['sorting id'])
+
 
     if args.WB:
         Peptide_file_name = args.WB +  '/../manual_review/' + args.samp + "_Peptides_51-mer.xlsx"
diff --git a/scripts/requirements.txt b/scripts/requirements.txt
@@ -2,4 +2,5 @@ argparse
 pandas
 et-xmlfile == 1.1.0 
 openpyxl == 3.1.2
-bs4 == 0.0.1
+bs4 == 0.0.1
+biopython
diff --git a/scripts/setup_review.py b/scripts/setup_review.py
@@ -1,6 +1,17 @@
 import argparse
 import subprocess
 
+def execute_script(script_path):
+    try:
+        print("Executing...", script_path)
+        subprocess.run(script_path, shell=True, check=True)
+        print("Successful.")
+        print()
+    except subprocess.CalledProcessError as e:
+        print(f"Warning: Script {script_path} did not execute correctly. Error: {e}")
+        print()
+
+
 # Define the command-line arguments
 parser = argparse.ArgumentParser(description='Sets up manuel review files')
 
@@ -30,15 +41,11 @@
 
 
 # Execute the first script
-print("Generating Review Files...")
-
-subprocess.run(command1, shell=True)
+execute_script(command1)
 
 # Execute the second script
-print("Coloring Peptide Sequeces...")
+execute_script(command2)
 
-subprocess.run(command2, shell=True)
 
 
-print("Scripts have been executed successfully.")