Skip to content

Commit 823abdc

Browse files
committed
Handle cases when there is no typing and when redux fails.
- If typing is '' return it as empty
- If redux_gl fails, print it as problematic

Produce a summary of all problematic typings with column name.

```
Summary
-------
Failed reducing 'A*0.559722222' in column d_a_typ1
Failed reducing 'A*0.613194444' in column d_a_typ1
Failed reducing 'A*0.559722222' in column d_a_typ1
Failed reducing 'A*0.247916667' in column d_a_typ1
Failed reducing 'A*0.215972222' in column d_a_typ1
Failed reducing 'A*0.45994213' in column d_a_typ1
Failed reducing 'A*0.529166667' in column d_a_typ1
Failed reducing 'A*0.529166667' in column d_a_typ1
Failed reducing 'A*0.559722222' in column d_a_typ1
Failed reducing 'A*0.529166667' in column d_a_typ2
```

Fixes #96
1 parent cb5a0ab commit 823abdc

File tree

1 file changed

+77
-62
lines changed

1 file changed

+77
-62
lines changed

scripts/pyard-reduce-csv

File mode changed: 100644 → 100755
Lines changed: 77 additions & 62 deletions
Original file line numberDiff line numberDiff line change
@@ -56,19 +56,6 @@ def is_P(allele: str) -> bool:
5656
return False
5757

5858

59-
def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
60-
if allele:
61-
# Remove all white spaces
62-
allele = white_space_regex.sub('', allele)
63-
locus = column_name.split('_')[1].upper()
64-
# If the allele comes in as an allele list, apply reduce to all alleles
65-
if '/' in allele:
66-
return "/".join(map(reduce, allele.split('/'), locus))
67-
else:
68-
return reduce(allele, locus)
69-
return allele
70-
71-
7259
def should_be_reduced(allele, locus_allele):
7360
if is_serology(allele):
7461
return ard_config["reduce_serology"]
@@ -96,8 +83,10 @@ def should_be_reduced(allele, locus_allele):
9683
return False
9784

9885

99-
def reduce(allele, locus):
86+
def reduce(allele, locus, column_name):
10087
# Does the allele name have the locus in it ?
88+
if allele == '':
89+
return allele
10190
if '*' in allele:
10291
locus_allele = allele
10392
elif ard_config["locus_in_allele_name"]:
@@ -108,7 +97,15 @@ def reduce(allele, locus):
10897
# Check the config if this allele should be reduced
10998
if should_be_reduced(allele, locus_allele):
11099
# print(f"reducing '{locus_allele}'")
111-
reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
100+
try:
101+
reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
102+
except RuntimeError as e:
103+
if verbose:
104+
print(e)
105+
message = f"Failed reducing '{locus_allele}' in column {column_name}"
106+
print(message)
107+
failure_summary_messages.append(message)
108+
return allele
112109
# print(f"reduced to '{reduced_allele}'")
113110
if reduced_allele:
114111
if ard_config["keep_locus_in_allele_name"]:
@@ -129,6 +126,19 @@ def reduce(allele, locus):
129126
return allele
130127

131128

129+
def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
130+
if allele:
131+
# Remove all white spaces
132+
allele = white_space_regex.sub('', allele)
133+
locus = column_name.split('_')[1].upper()
134+
# If the allele comes in as an allele list, apply reduce to all alleles
135+
if '/' in allele:
136+
return "/".join(map(reduce, allele.split('/'), locus, column_name))
137+
else:
138+
return reduce(allele, locus, column_name)
139+
return allele
140+
141+
132142
def create_drbx(row, locus_in_allele_name):
133143
return drbx.map_drbx(row.values, locus_in_allele_name)
134144

@@ -159,51 +169,56 @@ if __name__ == '__main__':
159169
print(" pip install openpyxl")
160170
sys.exit(1)
161171

162-
# Instantiate py-ard object with the latest
163-
ard = pyard.ARD(remove_invalid=False)
164-
165-
# Read the Input File
166-
# Read only the columns to be saved.
167-
# Header is the first row
168-
# Don't convert to NAs
169-
df = pd.read_csv(ard_config["in_csv_filename"],
170-
usecols=ard_config["columns_from_csv"],
171-
header=0, dtype=str,
172-
keep_default_na=False)
173-
174-
# Reduce each of the specified columns
175-
for column in ard_config["columns_to_reduce_in_csv"]:
176-
if verbose:
177-
print(f"Column:{column} =>")
178-
if ard_config["new_column_for_redux"]:
179-
# insert a new column
180-
new_column_name = f"reduced_{column}"
181-
new_column_index = df.columns.get_loc(column) + 1
182-
# Apply clean_locus function to the column and insert as a new column
183-
df.insert(new_column_index, new_column_name,
184-
df[column].apply(clean_locus, column_name=column))
172+
# Instantiate py-ard object with the latest
173+
ard = pyard.ARD(remove_invalid=False)
174+
175+
# Read the Input File
176+
# Read only the columns to be saved.
177+
# Header is the first row
178+
# Don't convert to NAs
179+
df = pd.read_csv(ard_config["in_csv_filename"],
180+
usecols=ard_config["columns_from_csv"],
181+
header=0, dtype=str,
182+
keep_default_na=False)
183+
184+
failure_summary_messages = []
185+
# Reduce each of the specified columns
186+
for column in ard_config["columns_to_reduce_in_csv"]:
187+
if verbose:
188+
print(f"Column:{column} =>")
189+
if ard_config["new_column_for_redux"]:
190+
# insert a new column
191+
new_column_name = f"reduced_{column}"
192+
new_column_index = df.columns.get_loc(column) + 1
193+
# Apply clean_locus function to the column and insert as a new column
194+
df.insert(new_column_index, new_column_name,
195+
df[column].apply(clean_locus, column_name=column))
196+
else:
197+
# Apply clean_locus function to the column and replace the column
198+
df[column] = df[column].apply(clean_locus, column_name=column)
199+
200+
# Map DRB3,DRB4,DRB5 to DRBX if specified
201+
# New columns DRBX_1 and DRBX_2 are created
202+
if ard_config['map_drb345_to_drbx']:
203+
drbx_loci = ['DRB3', 'DRB4', 'DRB5']
204+
drbx_columns = [col_name for col_name in df.columns if col_name.split('_')[1] in drbx_loci]
205+
if len(drbx_columns) == len(drbx_loci) * 2: # For Type1/Type2
206+
locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
207+
df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,))
208+
df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx)
209+
210+
# Save as XLSX if specified
211+
if ard_config["output_file_format"] == 'xlsx':
212+
out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
213+
df.to_excel(out_file_name, index=False)
185214
else:
186-
# Apply clean_locus function to the column and replace the column
187-
df[column] = df[column].apply(clean_locus, column_name=column)
188-
189-
# Map DRB3,DRB4,DRB5 to DRBX if specified
190-
# New columns DRBX_1 and DRBX_2 are created
191-
if ard_config['map_drb345_to_drbx']:
192-
drbx_loci = ['DRB3', 'DRB4', 'DRB5']
193-
drbx_columns = [col_name for col_name in df.columns if col_name.split('_')[1] in drbx_loci]
194-
if len(drbx_columns) == len(drbx_loci) * 2: # For Type1/Type2
195-
locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
196-
df_drbx = df[drbx_columns].apply(create_drbx, axis=1, args=(locus_in_allele_name,))
197-
df['DRBX_1'], df['DRBX_2'] = zip(*df_drbx)
198-
199-
# Save as XLSX if specified
200-
if ard_config["output_file_format"] == 'xlsx':
201-
out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
202-
df.to_excel(out_file_name, index=False)
203-
else:
204-
# Save as compressed CSV
205-
out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}"
206-
df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
207-
208-
# Done
209-
print(f"Saved result to file:{out_file_name}")
215+
# Save as compressed CSV
216+
out_file_name = f"{ard_config['out_csv_filename'] + '.gz' if ard_config['apply_compression'] else ''}"
217+
df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
218+
219+
print("Summary")
220+
print("-------")
221+
for message in failure_summary_messages:
222+
print("\t", message)
223+
# Done
224+
print(f"Saved result to file:{out_file_name}")

0 commit comments

Comments (0)