Fix the addition of an extra C when there are cysteines in classI or classII; also change how sheets are joined to account for different classI and classII transcripts

evelyn-schmidt · evelyn-schmidt · commit ef92cce66216 · 2023-10-16T10:10:10.000-05:00
diff --git a/scripts/color_peptides51mer.py b/scripts/color_peptides51mer.py
@@ -205,31 +205,17 @@ def create_stylized_sequence(peptide_sequence):
 
             if nucleotide.open_tag:
 
-                if nucleotide.large: # we are assuming that a cystine is never in the classI and classIi
-                    new_string += '<span style="font-size:105%">'
-                    new_string += nucleotide.nucleotide
-
-                if nucleotide.bold and nucleotide.color and nucleotide.underline:
-                    new_string += '<span style="font-weight:bold;color:#ff0000;text-decoration:underline;">'
-                    new_string += nucleotide.nucleotide
-                elif nucleotide.bold and not nucleotide.color and not nucleotide.underline:
-                    new_string += '<span style="font-weight:bold;">'
-                    new_string += nucleotide.nucleotide
-                elif not nucleotide.bold and nucleotide.color and not nucleotide.underline:
-                    new_string += '<span style="color:#ff0000;">'
-                    new_string += nucleotide.nucleotide
-                elif not nucleotide.bold and not nucleotide.color and nucleotide.underline:
-                    new_string += '<span style="text-decoration:underline;">'
-                    new_string += nucleotide.nucleotide
-                elif nucleotide.bold and nucleotide.color and not nucleotide.underline:
-                    new_string += '<span style="font-weight:bold;color:#ff0000;">'
-                    new_string += nucleotide.nucleotide
-                elif not nucleotide.bold and nucleotide.color and nucleotide.underline:
-                    new_string += '<span style="color:#ff0000;text-decoration:underline;">'
-                    new_string += nucleotide.nucleotide
-                elif nucleotide.bold and not nucleotide.color and nucleotide.underline:
-                    new_string += '<span style="font-weight:bold;text-decoration:underline;">'
-                    new_string += nucleotide.nucleotide
+                new_string += '<span style="'
+                if nucleotide.bold:
+                    new_string += 'font-weight:bold;'
+                if nucleotide.color:
+                    new_string += 'color:#ff0000;'
+                if nucleotide.underline:
+                    new_string += 'text-decoration:underline;'
+                if nucleotide.large:
+                     new_string += 'font-size:105%;'
+                new_string += '">'
+                new_string += nucleotide.nucleotide
 
             if not nucleotide.large and not nucleotide.bold and not nucleotide.color and not nucleotide.underline:
                 new_string += nucleotide.nucleotide
@@ -241,56 +227,41 @@ def main():
     args = parse_arguments()
 
     # read in classI and class II
-    #peptides_51mer = pd.read_excel("/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/neoag_vaccine_scripts/scripts/data_files/10146-0021_Peptides_51-mer.xlsx")
-    #classI = pd.read_csv("/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/neoag_vaccine_scripts/scripts/data_files/classI.TWJF-10146-0021-Tumor_Lysate.all_epitopes.aggregated.tsv", sep="\t")
-    #classII = pd.read_csv("/Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/neoag_vaccine_scripts/scripts/data_files/classII.TWJF-10146-0021-Tumor_Lysate.all_epitopes.aggregated.tsv", sep="\t")
-
     peptides_51mer = pd.read_excel(args.p)
     classI = pd.read_csv(args.classI, sep="\t")
     classII = pd.read_csv(args.classII, sep="\t")
 
     # Create a universal ID by editing the peptide 51mer ID
     peptides_51mer.rename(columns={'ID': 'full ID'}, inplace=True)
-    peptides_51mer['ID'] = peptides_51mer['full ID']
+    peptides_51mer['51mer ID'] = peptides_51mer['full ID']
 
-    peptides_51mer['ID'] = peptides_51mer['ID'].apply(lambda x: '.'.join(x.split('.')[1:]))  # Removing before first period, periods will be removed 
+    peptides_51mer['51mer ID'] = peptides_51mer['51mer ID'].apply(lambda x: '.'.join(x.split('.')[1:]))  # Removing before first period, periods will be removed 
     
-    peptides_51mer['ID'] = peptides_51mer['ID'].apply(lambda x: '.'.join(x.split('.')[1:]))  # Removing before second period
-    peptides_51mer['ID'] = peptides_51mer['ID'].apply(lambda x: '.'.join(x.split('.')[:3]) + '.' + '.'.join(x.split('.')[4:]))
+    peptides_51mer['51mer ID'] = peptides_51mer['51mer ID'].apply(lambda x: '.'.join(x.split('.')[1:]))  # Removing before second period
+    peptides_51mer['51mer ID'] = peptides_51mer['51mer ID'].apply(lambda x: '.'.join(x.split('.')[:3]) + '.' + '.'.join(x.split('.')[4:]))
     
 
     for index, row in peptides_51mer.iterrows():
-        for i, char in enumerate(row['ID'][::-1]):
+        for i, char in enumerate(row['51mer ID'][::-1]):
             if char.isdigit():
-                peptides_51mer.at[index, 'ID'] = row['ID'][:-i]
+                peptides_51mer.at[index, '51mer ID'] = row['51mer ID'][:-i]
                 break
         else:
-            result = row['ID']
-
-    # create a key that is gene, transcript, AA change for CLASSI
-    classII['modified AA Change'] = classII['AA Change'] 
-
-    # Apply the function to the 'Value' column
-    classII['modified AA Change'] = classII['modified AA Change'].apply(rearrange_string)
+            result = row['51mer ID']
 
-    classII['ID'] = classII['Gene'] + '.' + classII['Best Transcript'] + '.' + classII['modified AA Change'] 
+    # create a dataframe that contains the classI and classII peptide sequences
+    classI.rename(columns = {"Best Peptide":"Best Peptide Class I"}, inplace=True)
+    classII.rename(columns = {"Best Peptide":"Best Peptide Class II"}, inplace=True)
 
-    # create a key that is gene, transcript, AA change for CLASSI
+    # create a key that is gene, transcript, AA change for ClassI to join to the peptides order form
     classI['modified AA Change'] = classI['AA Change'] 
-
-    # Apply the function to the 'Value' column
     classI['modified AA Change'] = classI['modified AA Change'].apply(rearrange_string)
+    classI['51mer ID'] = classI['Gene'] + '.' + classI['Best Transcript'] + '.' + classI['modified AA Change'] 
 
-    classI['ID'] = classI['Gene'] + '.' + classI['Best Transcript'] + '.' + classI['modified AA Change'] 
-
-    # Merge the sequences from classI and classII with peptide 51mer
-    merged_peptide_51mer = pd.merge(peptides_51mer, classII[['ID', 'Best Peptide']], on='ID', how='left')
-
-    merged_peptide_51mer.rename(columns = {"Best Peptide":"Best Peptide Class II"}, inplace=True)
-
-    merged_peptide_51mer = pd.merge(merged_peptide_51mer, classI[['ID', 'Best Peptide', 'Pos']], on='ID', how='left')
+    class_sequences = pd.merge(classI[['ID', 'Best Peptide Class I', '51mer ID', 'Pos']], classII[['ID', 'Best Peptide Class II']], on='ID', how='left')
 
-    merged_peptide_51mer.rename(columns = {"Best Peptide":"Best Peptide Class I"}, inplace=True)
+    # Create a dataframe that has the classI and classII sequence
+    merged_peptide_51mer = pd.merge(peptides_51mer, class_sequences, on='51mer ID', how='left')
 
     # convert peptide 51mer to HTML
     peptides_51mer_html = peptides_51mer.to_html(index=False) # convert to html
@@ -301,16 +272,17 @@ def main():
 
     for index, row in peptides_51mer.iterrows():
 
-        search_string = row['full ID']
+        search_string = row['51mer ID']
+        print(search_string)
 
         #classII_sequence 
-        classII_peptide = merged_peptide_51mer.loc[merged_peptide_51mer['full ID'] == search_string, 'Best Peptide Class II'].values[0]
+        classII_peptide = merged_peptide_51mer.loc[merged_peptide_51mer['51mer ID'] == search_string, 'Best Peptide Class II'].values[0]
         #classI_sequence 
-        classI_peptide = merged_peptide_51mer.loc[merged_peptide_51mer['full ID'] == search_string, 'Best Peptide Class I'].values[0]
+        classI_peptide = merged_peptide_51mer.loc[merged_peptide_51mer['51mer ID'] == search_string, 'Best Peptide Class I'].values[0]
         
         
         # mutant pepetide position ---
-        mutant_peptide_pos = str(merged_peptide_51mer.loc[merged_peptide_51mer['full ID'] == search_string, 'Pos'].values[0])
+        mutant_peptide_pos = str(merged_peptide_51mer.loc[merged_peptide_51mer['51mer ID'] == search_string, 'Pos'].values[0])
 
         # Find the tag containing the search string
         tag_with_search_string = peptides_51mer_soup.find('td', string=search_string)
@@ -330,7 +302,7 @@ def main():
             # actaully lets break class I and classII into two steps and handle the mutated nucleotide in class I function
             # it should be basically like at that position in the class I set 
             
-            set_underline(peptide_sequence, mutant_peptide_pos, row['full ID'])
+            set_underline(peptide_sequence, mutant_peptide_pos, row['51mer ID'])
 
             set_span_tags(peptide_sequence) # pass by reference
             
diff --git a/scripts/setup_review.py b/scripts/setup_review.py
@@ -25,8 +25,8 @@
 
 
 
-command1 = f"python /opt/scripts/generate_reviews_files.py -WB {args.WB} -a {args.a} -c {args.c} -samp {args.samp}"
-command2 = f"python /opt/scripts/color_peptides51mer.py -WB {args.WB} -p {args.WB}/../manual_review/{args.samp}_Peptides_51-mer.xlsx -classI {args.classI} -classII {args.classII} -samp {args.samp}"
+command1 = f"python /Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/neoag_vaccine_scripts/scripts/generate_reviews_files.py -WB {args.WB} -a {args.a} -c {args.c} -samp {args.samp}"
+command2 = f"python /Volumes/mgriffit/Active/griffithlab/gc2596/e.schmidt/neoag_vaccine_scripts/scripts/color_peptides51mer.py -WB {args.WB} -p {args.WB}/../manual_review/{args.samp}_Peptides_51-mer.xlsx -classI {args.classI} -classII {args.classII} -samp {args.samp}"
 
 
 # Execute the first script