Order by peptide sheet, MW, add extra columns to review file, fix some formatting

evelyn-schmidt · evelyn-schmidt · commit 5faf97ad919f · 2023-10-23T10:41:18.000-05:00
diff --git a/Dockerfile b/Dockerfile
@@ -2,6 +2,9 @@
 
 FROM python:3.8-slim-buster
 
+# Update and install in a single layer so a cached, stale package index is never reused by the install step.
+RUN apt-get update && apt-get install -y vim
+
 ADD scripts/get_FDA_thresholds.py /opt/scripts/get_FDA_thresholds.py
 ADD scripts/get_neoantigen_qc.py /opt/scripts/get_neoantigen_qc.py
 ADD scripts/requirements.txt /opt/scripts/requirements.txt
diff --git a/scripts/color_peptides51mer.py b/scripts/color_peptides51mer.py
@@ -320,9 +320,12 @@ def main():
             print("ClassI: ", classI_peptide)
             print("ClassII: ", classII_peptide, "\n")
 
-        tag_with_search_string =  modified_html.find('th', string="51mer ID")
-        tag_with_search_string.decompose()
-
+        soup = BeautifulSoup(modified_html, 'html.parser')
+        tag_with_search_string = soup.select_one('th:-soup-contains("51mer ID")')
+        if tag_with_search_string:
+            tag_with_search_string.decompose()
+        # Now 'soup' contains the modified HTML with the tag removed
+        modified_html = soup.prettify(formatter=None)
 
     if args.WB:
         html_file_name = args.WB +  '/../manual_review/' + args.samp + ".Colored_Peptides.html" 
diff --git a/scripts/generate_reviews_files.py b/scripts/generate_reviews_files.py
@@ -3,6 +3,7 @@
 import pandas as pd
 import sys
 from Bio.SeqUtils.ProtParam import ProteinAnalysis
+import re
 
 '''
 Write a script to create the files for the Case Final Reports
@@ -37,6 +38,29 @@ def parse_arguments():
 
     return(parser.parse_args())
 
+# Function to split the peptide ID on '.' and return fields 2-4 re-joined with '.' (described by the author as the gene and AA information)
+def extract_info(value):
+    parts = value.split('.')
+    result = '.'.join([parts[2], parts[3], parts[4]])  # IndexError if the ID has fewer than five '.'-separated fields
+    return result
+
+# Function to rearrange string so that G518D looks like 518G/D; returns s unchanged when the pattern does not match
+def rearrange_string(s):
+    match = re.match(r'([A-Za-z]+)([\d-]+)([A-Za-z]*)', s)  # anchored at the start only: letters, then digits/'-', then optional letters
+    if match:
+        letters_before = match.group(1)
+        numbers = match.group(2)
+        letters_after = match.group(3)  # may be empty, giving e.g. "518G/"
+                
+        return f"{numbers}{letters_before}/{letters_after}"  # any text after the matched prefix is dropped
+    else:
+        return s
+    
+# Function to calculate the molecular weight of a peptide sequence string via Biopython's ProteinAnalysis
+def calculate_molecular_weight(peptide):
+    analyzed_seq = ProteinAnalysis(peptide)
+    return analyzed_seq.molecular_weight()
+
 def main():
 
     # 1. ITB reivew
@@ -54,35 +78,42 @@ def main():
     
     reviewed_canidates = reviewed_canidates[reviewed_canidates.Evaluation != "Pending"]
     reviewed_canidates = reviewed_canidates[reviewed_canidates.Evaluation != "Reject"]
-    # key for sorted the tables
-    reviewed_canidates["CANDIDATE NEOANTIGEN"] = args.samp + "." + "MT." + reviewed_canidates["Pos"] + "." + reviewed_canidates["Gene"]
+
+    reviewed_canidates = reviewed_canidates.rename(columns={'Comments':'pVAC Review Comments'})
+    reviewed_canidates["Variant Called by CLE Pipeline"] = " "
+    reviewed_canidates["IGV Review Comments"] = " "
+
+
+    # create a sorting ID (gene.transcript) so these rows can be put in the same order as the peptides
+    reviewed_canidates['sorting id'] = reviewed_canidates['Gene']  + '.' + reviewed_canidates['Best Transcript']
+
 
     peptides = pd.read_csv(args.c, sep="\t")
     peptides =  peptides.drop(['cterm_7mer_gravy_score', 'cysteine_count', 'n_terminal_asparagine', 'asparagine_proline_bond_count', 
                                  'difficult_n_terminal_residue', 'c_terminal_cysteine', 'c_terminal_proline', 'max_7mer_gravy_score'], axis=1)
     peptides["RESTRICTING HLA ALLELE"] = " "
 
-    # Define a function to calculate molecular weight
-    def calculate_molecular_weight(peptide):
-        analyzed_seq = ProteinAnalysis(peptide)
-        return analyzed_seq.molecular_weight()
-
     peptides["CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE MW (CLIENT)"] = peptides["peptide_sequence"].apply(calculate_molecular_weight)
 
     peptides = peptides.rename(columns={"id":"ID", "peptide_sequence":"CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE WITH FLANKING RESIDUES"})
     peptides["Comments"] = " "
     peptides["CANDIDATE NEOANTIGEN"] = peptides["ID"].apply(lambda x: '.'.join(x.split('.')[:3]))
     peptides["CANDIDATE NEOANTIGEN"] = args.samp + "." + peptides["CANDIDATE NEOANTIGEN"]
 
-
-
     peptides = peptides[["ID", "CANDIDATE NEOANTIGEN", "CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE WITH FLANKING RESIDUES", 
                            "RESTRICTING HLA ALLELE", "CANDIDATE NEOANTIGEN AMINO ACID SEQUENCE MW (CLIENT)", "Comments"]]
     
-    # Sort the reviewed canidates according to peptide
-    reviewed_canidates =  reviewed_canidates.set_index('Col2')
-    reviewed_canidates =  reviewed_canidates.reindex(index=peptides['CANDIDATE NEOANTIGEN'])
-    reviewed_canidates =  reviewed_canidates.reset_index()
+
+    # create an ID to sort the reviewed candidates by the order of the 51mers
+    peptides['sorting id'] = peptides['ID'].apply(extract_info)
+
+    reviewed_canidates = reviewed_canidates.set_index('sorting id')
+    reviewed_canidates = reviewed_canidates.reindex(index=peptides['sorting id'])
+    reviewed_canidates = reviewed_canidates.reset_index()
+
+    reviewed_canidates = reviewed_canidates.drop(columns=['sorting id'])
+    peptides = peptides.drop(columns=['sorting id'])
+
 
     if args.WB:
         Peptide_file_name = args.WB +  '/../manual_review/' + args.samp + "_Peptides_51-mer.xlsx"
diff --git a/scripts/setup_review.py b/scripts/setup_review.py
@@ -1,6 +1,17 @@
 import argparse
 import subprocess
 
+def execute_script(script_path):  # run a command string through the shell, reporting success or failure on stdout
+    try:
+        print("Executing...", script_path)
+        subprocess.run(script_path, shell=True, check=True)  # check=True raises CalledProcessError on a non-zero exit status
+        print("Successful.")
+        print()
+    except subprocess.CalledProcessError as e:  # best-effort: warn and return so the caller can continue
+        print(f"Warning: Script {script_path} did not execute correctly. Error: {e}")
+        print()
+
+
 # Define the command-line arguments
 parser = argparse.ArgumentParser(description='Sets up manuel review files')
 
@@ -30,15 +41,11 @@
 
 
 # Execute the first script
-print("Generating Review Files...")
-
-subprocess.run(command1, shell=True)
+execute_script(command1)
 
 # Execute the second script
-print("Coloring Peptide Sequeces...")
+execute_script(command2)
 
-subprocess.run(command2, shell=True)
 
 
-print("Scripts have been executed successfully.")