@@ -32,13 +32,14 @@ import argparse
 import json
 import re
 import sys
+from urllib.error import HTTPError
 
 import pandas as pd
 
 import pyard
 from pyard.db import similar_alleles
 import pyard.drbx as drbx
-from pyard.exceptions import PyArdError
+from pyard.exceptions import PyArdError, InvalidTypingError
 from pyard.misc import get_data_dir, get_imgt_version, download_to_file
 
 
@@ -171,6 +172,91 @@ def create_drbx(row, locus_in_allele_name):
     return drbx.map_drbx(row.values, locus_in_allele_name)
 
 
+def reduce_locus_columns(df, ard_config, locus_column_mapping, verbose):
+    reduce_prefix = ard_config.get("reduced_column_prefix", "reduced_")
+    for subject in locus_column_mapping:
+        for locus in locus_column_mapping[subject]:
+            # Reduce each of the specified columns
+            locus_columns = locus_column_mapping[subject][locus]
+            for column in locus_columns:
+                if verbose:
+                    print(f"Column:{column} =>")
+                if ard_config.get("new_column_for_redux"):
+                    # insert a new column
+                    new_column_name = f"{reduce_prefix}{column}"
+                    new_column_index = df.columns.get_loc(column) + 1
+                    # Apply clean_locus function to the column and insert as a new column
+                    df.insert(
+                        new_column_index,
+                        new_column_name,
+                        df[column].apply(clean_locus, locus=locus, column_name=column),
+                    )
+                    locus_columns[locus_columns.index(column)] = new_column_name
+                else:
+                    # Apply clean_locus function to the column and replace the column
+                    df[column] = df[column].apply(
+                        clean_locus, locus=locus, column_name=column
+                    )
+    # Map DRB3,DRB4,DRB5 to DRBX if specified
+    # New columns DRBX_1 and DRBX_2 are created
+    if ard_config.get("map_drb345_to_drbx"):
+        drbx_loci = ["DRB3", "DRB4", "DRB5"]
+        drbx_columns = [
+            col_name for col_name in df.columns if col_name.split("_")[1] in drbx_loci
+        ]
+        if len(drbx_columns) == len(drbx_loci) * 2:  # For Type1/Type2
+            locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
+            df_drbx = df[drbx_columns].apply(
+                create_drbx, axis=1, args=(locus_in_allele_name,)
+            )
+            df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx)
+
+    if ard_config.get("generate_glstring"):
+        for subject in locus_column_mapping:
+            slug_columns = []
+            for locus in locus_column_mapping[subject]:
+                slug_column = locus + "_slug"
+                slug_columns.append(slug_column)
+                if len(locus_column_mapping[subject][locus]) > 1:
+                    df[slug_column] = (
+                        df[locus_column_mapping[subject][locus][0]]
+                        + "+"
+                        + df[locus_column_mapping[subject][locus][1]]
+                    )
+                else:
+                    df[slug_column] = df[locus_column_mapping[subject][locus][0]]
+
+            df[subject + "_gl"] = df[slug_columns].agg("^".join, axis=1)
+            df[subject + "_gl"] = df[subject + "_gl"].apply(
+                lambda gl: gl.replace("^+", "")
+            )
+            df.drop(columns=slug_columns, inplace=True)
+
+
+def reduce_glstring(glstring: str) -> str:
+    try:
+        return ard.redux(glstring, ard_config["redux_type"])
+    except InvalidTypingError as e:
+        print(f"Error reducing {glstring}\n", e.message, file=sys.stderr)
+        return "Failed"
+
+
+def reduce_glstring_columns(df, ard_config, glstring_columns):
+    reduce_prefix = ard_config.get("reduced_column_prefix", "reduced_")
+    for column in glstring_columns:
+        if ard_config.get("new_column_for_redux"):
+            # insert a new column
+            new_column_name = f"{reduce_prefix}{column}"
+            new_column_index = df.columns.get_loc(column) + 1
+            # Apply reduce_glstring function to the column and insert as a new column
+            df.insert(
+                new_column_index, new_column_name, df[column].apply(reduce_glstring)
+            )
+        else:
+            # Apply reduce_glstring function to the column and replace the column
+            df[column] = df[column].apply(reduce_glstring)
+
+
 if __name__ == "__main__":
     # config is specified with a -c parameter
     parser = argparse.ArgumentParser()
@@ -207,15 +293,20 @@ if __name__ == "__main__":
     args = parser.parse_args()
 
     if args.generate:
-        config_url = "https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras/reduce_conf.json"
-        sample_config = "sample_reduce_conf.json"
-        download_to_file(config_url, sample_config)
-        print(f"Created {sample_config}")
-
-        sample_url = "https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras/sample.csv"
-        sample_csv = "sample.csv"
-        download_to_file(sample_url, sample_csv)
-        print(f"Created {sample_csv}")
+        sample_files = [
+            "reduce_conf.json",
+            "sample.csv",
+            "reduce_conf_glstring.json",
+            "sample_glstring.csv",
+        ]
+        for sample_file in sample_files:
+            try:
+                url = f"https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras/{sample_file}"
+                sample_local_file = f"sample_{sample_file}"
+                download_to_file(url, sample_local_file)
+                print(f"Created {sample_local_file}")
+            except HTTPError:
+                print(f"Download failed for {sample_file}")
         sys.exit(0)
 
     config_filename = args.config
@@ -248,8 +339,21 @@ if __name__ == "__main__":
     data_dir = get_data_dir(args.data_dir)
     imgt_version = get_imgt_version(args.imgt_version)
     max_cache_size = ard_config.get("redux_cache_size", pyard.DEFAULT_CACHE_SIZE)
+    csv_redux_config = {
+        "reduce_serology": ard_config.get("reduce_serology", True),
+        "reduce_v2": ard_config.get("reduce_v2", True),
+        "reduce_3field": ard_config.get("reduce_3field", True),
+        "reduce_P": ard_config.get("reduce_P", True),
+        "reduce_XX": ard_config.get("reduce_XX", True),
+        "reduce_MAC": ard_config.get("reduce_MAC", True),
+        "map_drb345_to_drbx": ard_config.get("map_drb345_to_drbx", True),
+        "verbose_log": ard_config.get("verbose_log", True),
+    }
     ard = pyard.init(
-        imgt_version=imgt_version, data_dir=data_dir, cache_size=max_cache_size
+        imgt_version=imgt_version,
+        data_dir=data_dir,
+        cache_size=max_cache_size,
+        config=csv_redux_config,
     )
 
     # Read the Input File
@@ -268,68 +372,14 @@ if __name__ == "__main__":
         print(f"File not found {ard_config.get('in_csv_filename')}", file=sys.stderr)
         sys.exit(1)
 
-    reduce_prefix = ard_config.get("reduced_column_prefix", "reduced_")
-
     failed_to_reduce_alleles = []
-    locus_column_mapping = ard_config["locus_column_mapping"]
-    for subject in locus_column_mapping:
-        for locus in locus_column_mapping[subject]:
-            # Reduce each of the specified columns
-            locus_columns = locus_column_mapping[subject][locus]
-            for column in locus_columns:
-                if verbose:
-                    print(f"Column:{column} =>")
-                if ard_config.get("new_column_for_redux"):
-                    # insert a new column
-                    new_column_name = f"{reduce_prefix}{column}"
-                    new_column_index = df.columns.get_loc(column) + 1
-                    # Apply clean_locus function to the column and insert as a new column
-                    df.insert(
-                        new_column_index,
-                        new_column_name,
-                        df[column].apply(clean_locus, locus=locus, column_name=column),
-                    )
-                    locus_columns[locus_columns.index(column)] = new_column_name
-                else:
-                    # Apply clean_locus function to the column and replace the column
-                    df[column] = df[column].apply(
-                        clean_locus, locus=locus, column_name=column
-                    )
-
-    # Map DRB3,DRB4,DRB5 to DRBX if specified
-    # New columns DRBX_1 and DRBX_2 are created
-    if ard_config.get("map_drb345_to_drbx"):
-        drbx_loci = ["DRB3", "DRB4", "DRB5"]
-        drbx_columns = [
-            col_name for col_name in df.columns if col_name.split("_")[1] in drbx_loci
-        ]
-        if len(drbx_columns) == len(drbx_loci) * 2:  # For Type1/Type2
-            locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
-            df_drbx = df[drbx_columns].apply(
-                create_drbx, axis=1, args=(locus_in_allele_name,)
-            )
-            df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx)
-
-    if ard_config.get("generate_glstring"):
-        for subject in locus_column_mapping:
-            slug_columns = []
-            for locus in locus_column_mapping[subject]:
-                slug_column = locus + "_slug"
-                slug_columns.append(slug_column)
-                if len(locus_column_mapping[subject][locus]) > 1:
-                    df[slug_column] = (
-                        df[locus_column_mapping[subject][locus][0]]
-                        + "+"
-                        + df[locus_column_mapping[subject][locus][1]]
-                    )
-                else:
-                    df[slug_column] = df[locus_column_mapping[subject][locus][0]]
+    locus_column_mapping = ard_config.get("locus_column_mapping", None)
+    if locus_column_mapping:
+        reduce_locus_columns(df, ard_config, locus_column_mapping, verbose)
 
-            df[subject + "_gl"] = df[slug_columns].agg("^".join, axis=1)
-            df[subject + "_gl"] = df[subject + "_gl"].apply(
-                lambda gl: gl.replace("^+", "")
-            )
-            df.drop(columns=slug_columns, inplace=True)
+    glstring_columns = ard_config.get("glstring_columns", None)
+    if glstring_columns:
+        reduce_glstring_columns(df, ard_config, glstring_columns)
 
     # Save as XLSX if specified
     if ard_config["output_file_format"] == "xlsx":
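
For context, here is a minimal standalone sketch of the new GL string reduction path added by this diff. The IMGT version ("3440"), the `"lgx"` redux type, and the example GL string are illustrative values only; the config keys mirror the `csv_redux_config` dictionary passed to `pyard.init()` above.

```python
# Sketch of the GL string redux path, assuming py-ard is installed.
import sys

import pyard
from pyard.exceptions import InvalidTypingError

# Same option names that csv_redux_config forwards to pyard.init() in the script
config = {
    "reduce_serology": True,
    "reduce_v2": True,
    "map_drb345_to_drbx": True,
    "verbose_log": False,
}
ard = pyard.init(imgt_version="3440", config=config)


def reduce_glstring(glstring: str, redux_type: str = "lgx") -> str:
    # Mirror the script's behavior: report the failure and return "Failed"
    # instead of letting InvalidTypingError propagate.
    try:
        return ard.redux(glstring, redux_type)
    except InvalidTypingError as e:
        print(f"Error reducing {glstring}", e.message, file=sys.stderr)
        return "Failed"


print(reduce_glstring("A*01:01+A*24:02^B*08:01+B*44:02"))
```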