@@ -154,11 +154,10 @@ def reduce(allele, locus, column_name):
154154 return allele
155155
156156
157- def clean_locus (allele : str , column_name : str = "Unknown" ) -> str :
157+ def clean_locus (allele : str , locus : str , column_name : str = "Unknown" ) -> str :
158158 if allele :
159159 # Remove all white spaces
160160 allele = white_space_regex .sub ("" , allele )
161- locus = column_name .split ("_" )[1 ].upper ()
162161 # If the allele comes in as an allele list, apply reduce to all alleles
163162 if "/" in allele :
164163 return "/" .join (map (reduce , allele .split ("/" ), locus , column_name ))
@@ -216,32 +215,49 @@ if __name__ == "__main__":
216215 # Read only the columns to be saved.
217216 # Header is the first row
218217 # Don't convert to NAs
219- df = pd .read_csv (
220- ard_config ["in_csv_filename" ],
221- usecols = ard_config ["columns_from_csv" ],
222- header = 0 ,
223- dtype = str ,
224- keep_default_na = False ,
225- )
218+ try :
219+ df = pd .read_csv (
220+ ard_config ["in_csv_filename" ],
221+ usecols = ard_config ["columns_from_csv" ],
222+ header = 0 ,
223+ dtype = str ,
224+ keep_default_na = False ,
225+ )
226+ except FileNotFoundError as e :
227+ print (f"File not found { ard_config ['in_csv_filename' ]} " , file = sys .stderr )
228+ sys .exit (1 )
226229
230+ reduce_prefix = "reduced_"
227231 failed_to_reduce_alleles = []
228- # Reduce each of the specified columns
229- for column in ard_config ["columns_to_reduce_in_csv" ]:
230- if verbose :
231- print (f"Column:{ column } =>" )
232- if ard_config ["new_column_for_redux" ]:
233- # insert a new column
234- new_column_name = f"reduced_{ column } "
235- new_column_index = df .columns .get_loc (column ) + 1
236- # Apply clean_locus function to the column and insert as a new column
237- df .insert (
238- new_column_index ,
239- new_column_name ,
240- df [column ].apply (clean_locus , column_name = column ),
241- )
242- else :
243- # Apply clean_locus function to the column and replace the column
244- df [column ] = df [column ].apply (clean_locus , column_name = column )
232+ reduced_column_mappings = {}
233+ locus_column_mapping = ard_config ["locus_column_mapping" ]
234+ for subject in locus_column_mapping :
235+ reduced_column_mappings [subject ] = {}
236+ for locus in locus_column_mapping [subject ]:
237+ if locus not in reduced_column_mappings [subject ]:
238+ reduced_column_mappings [subject ][locus ] = []
239+ # Reduce each of the specified columns
240+ locus_columns = locus_column_mapping [subject ][locus ]
241+ for column in locus_columns :
242+ if verbose :
243+ print (f"Column:{ column } =>" )
244+ if ard_config ["new_column_for_redux" ]:
245+ # insert a new column
246+ new_column_name = f"{ reduce_prefix } { column } "
247+ new_column_index = df .columns .get_loc (column ) + 1
248+ # Apply clean_locus function to the column and insert as a new column
249+ df .insert (
250+ new_column_index ,
251+ new_column_name ,
252+ df [column ].apply (clean_locus , locus = locus , column_name = column ),
253+ )
254+ reduced_column_mappings [subject ][locus ].append (new_column_name )
255+ else :
256+ # Apply clean_locus function to the column and replace the column
257+ df [column ] = df [column ].apply (
258+ clean_locus , locus = locus , column_name = column
259+ )
260+ reduced_column_mappings [subject ][locus ].append (column )
245261
246262 # Map DRB3,DRB4,DRB5 to DRBX if specified
247263 # New columns DRBX_1 and DRBX_2 are created
@@ -257,6 +273,19 @@ if __name__ == "__main__":
257273 )
258274 df ["DRBX_1" ], df ["DRBX_2" ] = zip (* df_drbx )
259275
276+ if ard_config ["generate_glstring" ]:
277+ for subject in reduced_column_mappings :
278+ for haplotype_num in range (2 ):
279+ hap1_columns = list (
280+ map (
281+ lambda x : reduced_column_mappings [subject ][x ][haplotype_num ],
282+ reduced_column_mappings [subject ].keys (),
283+ )
284+ )
285+ df [subject + f"_haplotype_{ (haplotype_num + 1 )} " ] = df [
286+ hap1_columns
287+ ].agg ("~" .join , axis = 1 )
288+
260289 # Save as XLSX if specified
261290 if ard_config ["output_file_format" ] == "xlsx" :
262291 out_file_name = f"{ ard_config ['out_csv_filename' ]} .xlsx"
0 commit comments