@@ -56,19 +56,6 @@ def is_P(allele: str) -> bool:
5656 return False
5757
5858
def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    """Strip whitespace from a typing value and reduce each allele in it.

    Args:
        allele: Cell value to clean. May be a single allele or a
            '/'-separated allele list. Falsy values are returned unchanged.
        column_name: Name of the column the value came from. The locus is
            taken from the second '_'-separated field, upper-cased.
            NOTE: a name without an underscore (including the default
            'Unknown') raises IndexError for non-empty values.

    Returns:
        The reduced value; allele lists are re-joined with '/'.
    """
    if not allele:
        # Nothing to clean or reduce.
        return allele

    # Remove all white spaces
    allele = white_space_regex.sub('', allele)
    locus = column_name.split('_')[1].upper()

    # If the allele comes in as an allele list, apply reduce to all alleles.
    # Call reduce explicitly per allele: handing `locus` to map() as an extra
    # iterable would feed it one character at a time and cut the list off
    # after len(locus) alleles.
    if '/' in allele:
        return "/".join(reduce(single_allele, locus)
                        for single_allele in allele.split('/'))
    return reduce(allele, locus)
70-
71-
7259def should_be_reduced (allele , locus_allele ):
7360 if is_serology (allele ):
7461 return ard_config ["reduce_serology" ]
@@ -96,8 +83,10 @@ def should_be_reduced(allele, locus_allele):
9683 return False
9784
9885
99- def reduce (allele , locus ):
86+ def reduce (allele , locus , column_name ):
10087 # Does the allele name have the locus in it ?
88+ if allele == '' :
89+ return allele
10190 if '*' in allele :
10291 locus_allele = allele
10392 elif ard_config ["locus_in_allele_name" ]:
@@ -108,7 +97,15 @@ def reduce(allele, locus):
10897 # Check the config if this allele should be reduced
10998 if should_be_reduced (allele , locus_allele ):
11099 # print(f"reducing '{locus_allele}'")
111- reduced_allele = ard .redux_gl (locus_allele , ard_config ["redux_type" ])
100+ try :
101+ reduced_allele = ard .redux_gl (locus_allele , ard_config ["redux_type" ])
102+ except RuntimeError as e :
103+ if verbose :
104+ print (e )
105+ message = f"Failed reducing '{ locus_allele } ' in column { column_name } "
106+ print (message )
107+ failure_summary_messages .append (message )
108+ return allele
112109 # print(f"reduced to '{reduced_allele}'")
113110 if reduced_allele :
114111 if ard_config ["keep_locus_in_allele_name" ]:
@@ -129,6 +126,19 @@ def reduce(allele, locus):
129126 return allele
130127
131128
def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    """Strip whitespace from a typing value and reduce each allele in it.

    Args:
        allele: Cell value to clean. May be a single allele or a
            '/'-separated allele list. Falsy values are returned unchanged.
        column_name: Name of the column the value came from. The locus is
            taken from the second '_'-separated field, upper-cased, and the
            full name is forwarded to reduce().
            NOTE: a name without an underscore (including the default
            'Unknown') raises IndexError for non-empty values.

    Returns:
        The reduced value; allele lists are re-joined with '/'.
    """
    if not allele:
        # Nothing to clean or reduce.
        return allele

    # Remove all white spaces
    allele = white_space_regex.sub('', allele)
    locus = column_name.split('_')[1].upper()

    # If the allele comes in as an allele list, apply reduce to all alleles.
    # Call reduce explicitly per allele: handing `locus` and `column_name` to
    # map() as extra iterables would feed them one character at a time and
    # cut the list off at the shortest of the three.
    if '/' in allele:
        return "/".join(reduce(single_allele, locus, column_name)
                        for single_allele in allele.split('/'))
    return reduce(allele, locus, column_name)
140+
141+
def create_drbx(row, locus_in_allele_name):
    """Map one row of DRB3/4/5 typings to DRBX via drbx.map_drbx.

    `row` must expose a `.values` attribute; those values and the
    `locus_in_allele_name` flag are passed straight through, and whatever
    drbx.map_drbx returns is returned unchanged.
    """
    drb_typings = row.values
    return drbx.map_drbx(drb_typings, locus_in_allele_name)
134144
@@ -159,51 +169,56 @@ if __name__ == '__main__':
159169 print (" pip install openpyxl" )
160170 sys .exit (1 )
161171
# Instantiate py-ard object with the latest
ard = pyard.ARD(remove_invalid=False)

# Read the Input File
# Read only the columns to be saved.
# Header is the first row
# Don't convert to NAs
df = pd.read_csv(ard_config["in_csv_filename"],
                 usecols=ard_config["columns_from_csv"],
                 header=0, dtype=str,
                 keep_default_na=False)

# Collects one message per allele that failed to reduce; reduce() appends to
# this list and it is printed as a summary at the end of the run.
failure_summary_messages = []

# Reduce each of the specified columns
for column in ard_config["columns_to_reduce_in_csv"]:
    if verbose:
        print(f"Column:{column} =>")
    if ard_config["new_column_for_redux"]:
        # insert a new column right after the source column
        new_column_name = f"reduced_{column}"
        new_column_index = df.columns.get_loc(column) + 1
        # Apply clean_locus function to the column and insert as a new column
        df.insert(new_column_index, new_column_name,
                  df[column].apply(clean_locus, column_name=column))
    else:
        # Apply clean_locus function to the column and replace the column
        df[column] = df[column].apply(clean_locus, column_name=column)

# Map DRB3,DRB4,DRB5 to DRBX if specified
# New columns DRBX_1 and DRBX_2 are created
if ard_config['map_drb345_to_drbx']:
    drbx_loci = ['DRB3', 'DRB4', 'DRB5']
    # Skip column names without an underscore instead of failing on the
    # [1] lookup; they cannot name a DRB3/4/5 locus in this scheme.
    drbx_columns = [col_name for col_name in df.columns
                    if '_' in col_name and col_name.split('_')[1] in drbx_loci]
    if len(drbx_columns) == len(drbx_loci) * 2:  # For Type1/Type2
        locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
        df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,))
        df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx)

# Save as XLSX if specified
if ard_config["output_file_format"] == 'xlsx':
    out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
    df.to_excel(out_file_name, index=False)
else:
    # Save as (optionally compressed) CSV.
    # Normalise a falsy setting to None, which pandas treats as "no compression".
    compression = ard_config["apply_compression"] or None
    # Only the '.gz' suffix is conditional; the base name is always kept so
    # an uncompressed run does not end up with an empty output path.
    out_file_name = f"{ard_config['out_csv_filename']}{'.gz' if compression else ''}"
    df.to_csv(out_file_name, index=False, compression=compression)

# Report every allele that could not be reduced
print("Summary")
print("-------")
for message in failure_summary_messages:
    print("\t", message)
# Done
print(f"Saved result to file:{out_file_name}")
0 commit comments