Skip to content

Commit 3bc6962

Browse files
Added a function to the review-file generation script that makes sure the ID columns are unique; also fixed some spelling errors
1 parent c4c223b commit 3bc6962

File tree

1 file changed

+39
-25
lines changed

1 file changed

+39
-25
lines changed

scripts/generate_reviews_files.py

Lines changed: 39 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,6 @@
99
Write a script to create the files for the Case Final Reports
1010
- Sample Peptides 51-mer
1111
- SAMPLE.Annotated.Neoantigen_Candidates.xlsx
12-
13-
Maybe the Sample Genomics Review Report with everything highlighted in yellow
14-
15-
16-
Use:
17-
python3 generate_reviews_files.py -a /Volumes/gillandersw/Active/Project_0001_Clinical_Trials/CTEP/analysis/TWJF-10146-MO011-0021/itb-review-files/10146-0021.Annotated.Neoantigen_Candidates.xlsx -c /Volumes/gillandersw/Active/Project_0001_Clinical_Trials/CTEP/analysis/TWJF-10146-MO011-0021/generate_protein_fasta/candidates/annotated_filtered.vcf-pass-51mer.fa.manufacturability.tsv -samp 10146-0021
1812
'''
1913

2014
# ---- PARSE ARGUMENTS -------------------------------------------------------
@@ -56,37 +50,55 @@ def rearrange_string(s):
5650
else:
5751
return s
5852

59-
# Function to calculate molecular weight
53+
# Function to calculate molecular weight---------------------------------------
6054
def calculate_molecular_weight(peptide):
6155
analyzed_seq = ProteinAnalysis(peptide)
6256
return analyzed_seq.molecular_weight()
6357

64-
def main():
58+
# Function to make id column unique -------------------------------------------
59+
def make_column_unique(df, column_name):
60+
seen_values = set()
61+
new_values = []
62+
63+
for value in df[column_name]:
64+
if value in seen_values:
65+
suffix = 1
66+
while f"{value}.{suffix}" in seen_values:
67+
suffix += 1
68+
unique_value = f"{value}.{suffix}"
69+
else:
70+
unique_value = value
6571

66-
# 1. ITB reivew
67-
# 2. Generate protein Fasta
72+
seen_values.add(unique_value)
73+
new_values.append(unique_value)
6874

75+
df[column_name] = new_values
76+
return df
77+
78+
79+
def main():
6980

7081
args = parse_arguments()
7182

72-
reviewed_canidates = pd.read_excel(args.a)
83+
reviewed_candidates = pd.read_excel(args.a)
7384

7485

75-
reviewed_canidates.columns = reviewed_canidates.iloc[0]
76-
reviewed_canidates = reviewed_canidates[1:] # there is a extra row before the col name row
77-
reviewed_canidates = reviewed_canidates.reset_index(drop=True) # Reset the index of the dataframe
86+
reviewed_candidates.columns = reviewed_candidates.iloc[0]
87+
reviewed_candidates = reviewed_candidates[1:] # there is a extra row before the col name row
88+
reviewed_candidates = reviewed_candidates.reset_index(drop=True) # Reset the index of the dataframe
7889

79-
reviewed_canidates = reviewed_canidates[reviewed_canidates.Evaluation != "Pending"]
80-
reviewed_canidates = reviewed_canidates[reviewed_canidates.Evaluation != "Reject"]
90+
reviewed_candidates = reviewed_candidates[reviewed_candidates.Evaluation != "Pending"]
91+
reviewed_candidates = reviewed_candidates[reviewed_candidates.Evaluation != "Reject"]
8192

82-
reviewed_canidates = reviewed_canidates.rename(columns={'Comments':'pVAC Review Comments'})
83-
reviewed_canidates["Variant Called by CLE Pipeline"] = " "
84-
reviewed_canidates["IGV Review Comments"] = " "
93+
reviewed_candidates = reviewed_candidates.rename(columns={'Comments':'pVAC Review Comments'})
94+
reviewed_candidates["Variant Called by CLE Pipeline"] = " "
95+
reviewed_candidates["IGV Review Comments"] = " "
8596

8697

8798
# create a sorting ID from the gene and transcript so the rows can be sorted in the same order as the peptides
88-
reviewed_canidates['sorting id'] = reviewed_canidates['Gene'] + '.' + reviewed_canidates['Best Transcript']
89-
99+
reviewed_candidates['sorting id'] = reviewed_candidates['Gene'] + '.' + reviewed_candidates['Best Transcript']
100+
# make sure the sorting id column is unique
101+
reviewed_canidates = make_column_unique(reviewed_candidates, 'sorting id')
90102

91103
peptides = pd.read_csv(args.c, sep="\t")
92104
peptides = peptides.drop(['cterm_7mer_gravy_score', 'cysteine_count', 'n_terminal_asparagine', 'asparagine_proline_bond_count',
@@ -106,12 +118,14 @@ def main():
106118

107119
# create an ID to sort the reviewed candidates by the order of the 51-mers
108120
peptides['sorting id'] = peptides['ID'].apply(extract_info)
121+
# make sure every sorting id is unique
122+
peptides = make_column_unique(peptides, 'sorting id')
109123

110-
reviewed_canidates = reviewed_canidates.set_index('sorting id')
111-
reviewed_canidates = reviewed_canidates.reindex(index=peptides['sorting id'])
112-
reviewed_canidates = reviewed_canidates.reset_index()
124+
reviewed_candidates = reviewed_candidates.set_index('sorting id')
125+
reviewed_candidates = reviewed_candidates.reindex(index=peptides['sorting id'])
126+
reviewed_candidates = reviewed_candidates.reset_index()
113127

114-
reviewed_canidates = reviewed_canidates.drop(columns=['sorting id'])
128+
reviewed_candidates = reviewed_candidates.drop(columns=['sorting id'])
115129
peptides = peptides.drop(columns=['sorting id'])
116130

117131

0 commit comments

Comments
 (0)