Skip to content

Commit 245b794

Browse files
committed
Support reducing glstring column in a csv file
1 parent f7217f6 commit 245b794

File tree

6 files changed

+177
-76
lines changed

6 files changed

+177
-76
lines changed

extras/README.md

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -150,6 +150,20 @@ The column names corresponding to the loci will be reduced and must appear in th
150150
}
151151
```
152152

153+
### GL String Columns
154+
155+
Instead of providing single locus alleles per column with `locus_column_mapping`, a GL String describing the whole
156+
genotype can be provided per column. Use `glstring_columns` to provide a list of GL String columns to reduce.
157+
158+
```json
159+
"glstring_columns": [
160+
"donor_gl",
161+
"recip_gl"
162+
],
163+
```
164+
165+
Provide whichever of `locus_column_mapping` or `glstring_columns` matches the layout of your data; only one of them is required.
166+
153167
### Redux Options
154168

155169
`redux_type` Reduction Type

extras/reduce_conf.json

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,8 @@
22
"in_csv_filename": "sample.csv",
33
"out_csv_filename": "clean_sample.csv",
44
"columns_from_csv": [
5-
"nmdp_id",
5+
"rid",
6+
"did",
67
"r_a_typ1",
78
"r_a_typ2",
89
"r_b_typ1",

extras/reduce_conf_glstring.json

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
{
2+
"in_csv_filename": "sample_glstring.csv",
3+
"out_csv_filename": "clean_sample_glstring.csv",
4+
"columns_from_csv": [
5+
"did",
6+
"rid",
7+
"donor_gl",
8+
"recip_gl"
9+
],
10+
"glstring_columns": [
11+
"donor_gl",
12+
"recip_gl"
13+
],
14+
"redux_type": "lgx",
15+
"redux_cache_size": 1000,
16+
"reduce_serology": false,
17+
"reduce_v2": true,
18+
"convert_v2_to_v3": false,
19+
"reduce_2field": true,
20+
"reduce_3field": true,
21+
"reduce_P": true,
22+
"reduce_XX": false,
23+
"reduce_MAC": true,
24+
"map_drb345_to_drbx": false,
25+
"locus_in_allele_name": true,
26+
"keep_locus_in_allele_name": true,
27+
"new_column_for_redux": true,
28+
"reduced_column_prefix": "reduced_",
29+
"generate_glstring": true,
30+
"output_file_format": "csv",
31+
"apply_compression": "gzip",
32+
"verbose_log": true
33+
}

extras/sample.csv

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
nmdp_id,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2,d_a_typ1,d_a_typ2,d_b_typ1,d_b_typ2,d_c_typ1,d_c_typ2,d_drb1_typ1,d_drb1_typ2,d_dpb1_typ1,d_dpb1_typ2
2-
123,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
3-
456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
4-
789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01
1+
rid,did,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2,d_a_typ1,d_a_typ2,d_b_typ1,d_b_typ2,d_c_typ1,d_c_typ2,d_drb1_typ1,d_drb1_typ2,d_dpb1_typ1,d_dpb1_typ2
2+
2110,123,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01,A*01:AB,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
3+
2111,456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
4+
2113,789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01

extras/sample_glstring.csv

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
rid,did,recip_gl,donor_gl
2+
123,456,A*02:GNF+A*03:XYZ^B*07:ABD+B*44:AWA,A*02:01:01+A*03:01:01^B*07:RVXR+B*44:XYAG
3+
789,345,A*01:TUS+A*24:02:01G^B*08:ARGR+B*08:ARGS,A*02:01:01+A*01:PXTD^B*51:01:01G+B*40:BWUP

scripts/pyard-reduce-csv

Lines changed: 121 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -32,13 +32,14 @@ import argparse
3232
import json
3333
import re
3434
import sys
35+
from urllib.error import HTTPError
3536

3637
import pandas as pd
3738

3839
import pyard
3940
from pyard.db import similar_alleles
4041
import pyard.drbx as drbx
41-
from pyard.exceptions import PyArdError
42+
from pyard.exceptions import PyArdError, InvalidTypingError
4243
from pyard.misc import get_data_dir, get_imgt_version, download_to_file
4344

4445

@@ -171,6 +172,91 @@ def create_drbx(row, locus_in_allele_name):
171172
return drbx.map_drbx(row.values, locus_in_allele_name)
172173

173174

175+
def reduce_locus_columns(df, ard_config, locus_column_mapping, verbose):
    """Reduce the per-locus allele columns of ``df`` in place.

    Each column listed in ``locus_column_mapping`` is passed through
    ``clean_locus``. Depending on the configuration, DRB3/4/5 columns are
    then collapsed into ``DRBX_1``/``DRBX_2`` and a ``<subject>_gl`` GL String
    column is generated per subject.

    Args:
        df: DataFrame holding the CSV data; modified in place.
        ard_config: Configuration mapping. Keys read here:
            ``reduced_column_prefix`` (default ``"reduced_"``),
            ``new_column_for_redux``, ``map_drb345_to_drbx``,
            ``keep_locus_in_allele_name`` and ``generate_glstring``.
        locus_column_mapping: ``{subject: {locus: [column, ...]}}``. When
            ``new_column_for_redux`` is set, the column lists are updated in
            place to name the newly inserted reduced columns, so the GL
            String below is built from the reduced values.
        verbose: When true, print the name of each column being reduced.
    """
    reduce_prefix = ard_config.get("reduced_column_prefix", "reduced_")
    insert_new_column = ard_config.get("new_column_for_redux")

    for subject in locus_column_mapping:
        for locus in locus_column_mapping[subject]:
            # Reduce each of the specified columns
            locus_columns = locus_column_mapping[subject][locus]
            for position, column in enumerate(locus_columns):
                if verbose:
                    print(f"Column:{column} =>")
                if insert_new_column:
                    # Insert the reduced values right after the source column
                    new_column_name = f"{reduce_prefix}{column}"
                    new_column_index = df.columns.get_loc(column) + 1
                    df.insert(
                        new_column_index,
                        new_column_name,
                        df[column].apply(clean_locus, locus=locus, column_name=column),
                    )
                    # Later steps should read the reduced column, not the raw one
                    locus_columns[position] = new_column_name
                else:
                    # Replace the column with its reduced values
                    df[column] = df[column].apply(
                        clean_locus, locus=locus, column_name=column
                    )

    # Map DRB3,DRB4,DRB5 to DRBX if specified
    # New columns DRBX_1 and DRBX_2 are created
    if ard_config.get("map_drb345_to_drbx"):
        drbx_loci = ["DRB3", "DRB4", "DRB5"]
        # The locus is taken from the second "_"-separated part of the column
        # name. Columns without an underscore (e.g. id columns such as "rid")
        # have no such part and are skipped instead of raising IndexError.
        drbx_columns = [
            col_name
            for col_name in df.columns
            if "_" in col_name and col_name.split("_")[1] in drbx_loci
        ]
        if len(drbx_columns) == len(drbx_loci) * 2:  # For Type1/Type2
            locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
            df_drbx = df[drbx_columns].apply(
                create_drbx, axis=1, args=(locus_in_allele_name,)
            )
            df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx)

    if ard_config.get("generate_glstring"):
        for subject in locus_column_mapping:
            slug_columns = []
            for locus in locus_column_mapping[subject]:
                typing_columns = locus_column_mapping[subject][locus]
                slug_column = locus + "_slug"
                slug_columns.append(slug_column)
                if len(typing_columns) > 1:
                    # Two typings for the locus are joined with "+"
                    df[slug_column] = (
                        df[typing_columns[0]] + "+" + df[typing_columns[1]]
                    )
                else:
                    df[slug_column] = df[typing_columns[0]]

            # Loci are joined with "^"; any "^+" sequence is then removed
            gl_column = subject + "_gl"
            df[gl_column] = df[slug_columns].agg("^".join, axis=1)
            df[gl_column] = df[gl_column].apply(lambda gl: gl.replace("^+", ""))
            # The per-locus slug columns were only temporary helpers
            df.drop(columns=slug_columns, inplace=True)
234+
235+
236+
def reduce_glstring(glstring: str) -> str:
    """Reduce a single GL String with the script's ``ard`` object.

    Uses the names ``ard`` and ``ard_config``, which this function does not
    define itself; the reduction type is read from
    ``ard_config["redux_type"]``.

    Args:
        glstring: The GL String to reduce.

    Returns:
        The value returned by ``ard.redux``. If the typing is rejected with
        ``InvalidTypingError``, the error is printed to stderr and the
        literal string ``"Failed"`` is returned instead.
    """
    try:
        reduced = ard.redux(glstring, ard_config["redux_type"])
    except InvalidTypingError as typing_error:
        print(f"Error reducing {glstring} \n", typing_error.message, file=sys.stderr)
        reduced = "Failed"
    return reduced
242+
243+
244+
def reduce_glstring_columns(df, ard_config, glstring_columns):
    """Reduce whole-genotype GL String columns of ``df`` in place.

    Every value of each listed column is passed through ``reduce_glstring``.

    Args:
        df: DataFrame holding the CSV data; modified in place.
        ard_config: Configuration mapping. Keys read here:
            ``reduced_column_prefix`` (default ``"reduced_"``) and
            ``new_column_for_redux``.
        glstring_columns: Names of the columns that contain GL Strings.
    """
    prefix = ard_config.get("reduced_column_prefix", "reduced_")
    for gl_column in glstring_columns:
        if not ard_config.get("new_column_for_redux"):
            # Overwrite the column with its reduced GL Strings
            df[gl_column] = df[gl_column].apply(reduce_glstring)
            continue

        # Keep the original column and place the reduced GL Strings in a new
        # column immediately to its right
        reduced_name = f"{prefix}{gl_column}"
        insert_at = df.columns.get_loc(gl_column) + 1
        df.insert(insert_at, reduced_name, df[gl_column].apply(reduce_glstring))
258+
259+
174260
if __name__ == "__main__":
175261
# config is specified with a -c parameter
176262
parser = argparse.ArgumentParser()
@@ -207,15 +293,20 @@ if __name__ == "__main__":
207293
args = parser.parse_args()
208294

209295
if args.generate:
210-
config_url = "https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras/reduce_conf.json"
211-
sample_config = "sample_reduce_conf.json"
212-
download_to_file(config_url, sample_config)
213-
print(f"Created {sample_config}")
214-
215-
sample_url = "https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras/sample.csv"
216-
sample_csv = "sample.csv"
217-
download_to_file(sample_url, sample_csv)
218-
print(f"Created {sample_csv}")
296+
# Sample configs and data published in the project's `extras` directory
sample_files = [
    "reduce_conf.json",
    "sample.csv",
    "reduce_conf_glstring.json",
    "sample_glstring.csv",
]
for sample_file in sample_files:
    # The sample configs name their input as "sample.csv" and
    # "sample_glstring.csv", so the CSV files must keep their original names.
    # Only the config files get the "sample_" prefix.
    if sample_file.endswith(".csv"):
        sample_local_file = sample_file
    else:
        sample_local_file = f"sample_{sample_file}"
    url = f"https://raw.githubusercontent.com/nmdp-bioinformatics/py-ard/master/extras/{sample_file}"
    try:
        download_to_file(url, sample_local_file)
    except HTTPError:
        # Keep going so one missing file does not block the others
        print(f"Download failed for {sample_file}")
    else:
        print(f"Created {sample_local_file}")
219310
sys.exit(0)
220311

221312
config_filename = args.config
@@ -248,8 +339,21 @@ if __name__ == "__main__":
248339
data_dir = get_data_dir(args.data_dir)
249340
imgt_version = get_imgt_version(args.imgt_version)
250341
max_cache_size = ard_config.get("redux_cache_size", pyard.DEFAULT_CACHE_SIZE)
342+
csv_redux_config = {
343+
"reduce_serology": ard_config.get("reduce_serology", True),
344+
"reduce_v2": ard_config.get("reduce_v2", True),
345+
"reduce_3field": ard_config.get("reduce_3field", True),
346+
"reduce_P": ard_config.get("reduce_P", True),
347+
"reduce_XX": ard_config.get("reduce_XX", True),
348+
"reduce_MAC": ard_config.get("reduce_MAC", True),
349+
"map_drb345_to_drbx": ard_config.get("map_drb345_to_drbx", True),
350+
"verbose_log": ard_config.get("verbose_log", True),
351+
}
251352
ard = pyard.init(
252-
imgt_version=imgt_version, data_dir=data_dir, cache_size=max_cache_size
353+
imgt_version=imgt_version,
354+
data_dir=data_dir,
355+
cache_size=max_cache_size,
356+
config=csv_redux_config,
253357
)
254358

255359
# Read the Input File
@@ -268,68 +372,14 @@ if __name__ == "__main__":
268372
print(f"File not found {ard_config.get('in_csv_filename')}", file=sys.stderr)
269373
sys.exit(1)
270374

271-
reduce_prefix = ard_config.get("reduced_column_prefix", "reduced_")
272-
273375
failed_to_reduce_alleles = []
274-
locus_column_mapping = ard_config["locus_column_mapping"]
275-
for subject in locus_column_mapping:
276-
for locus in locus_column_mapping[subject]:
277-
# Reduce each of the specified columns
278-
locus_columns = locus_column_mapping[subject][locus]
279-
for column in locus_columns:
280-
if verbose:
281-
print(f"Column:{column} =>")
282-
if ard_config.get("new_column_for_redux"):
283-
# insert a new column
284-
new_column_name = f"{reduce_prefix}{column}"
285-
new_column_index = df.columns.get_loc(column) + 1
286-
# Apply clean_locus function to the column and insert as a new column
287-
df.insert(
288-
new_column_index,
289-
new_column_name,
290-
df[column].apply(clean_locus, locus=locus, column_name=column),
291-
)
292-
locus_columns[locus_columns.index(column)] = new_column_name
293-
else:
294-
# Apply clean_locus function to the column and replace the column
295-
df[column] = df[column].apply(
296-
clean_locus, locus=locus, column_name=column
297-
)
298-
299-
# Map DRB3,DRB4,DRB5 to DRBX if specified
300-
# New columns DRBX_1 and DRBX_2 are created
301-
if ard_config.get("map_drb345_to_drbx"):
302-
drbx_loci = ["DRB3", "DRB4", "DRB5"]
303-
drbx_columns = [
304-
col_name for col_name in df.columns if col_name.split("_")[1] in drbx_loci
305-
]
306-
if len(drbx_columns) == len(drbx_loci) * 2: # For Type1/Type2
307-
locus_in_allele_name = ard_config["keep_locus_in_allele_name"]
308-
df_drbx = df[drbx_columns].apply(
309-
create_drbx, axis=1, args=(locus_in_allele_name,)
310-
)
311-
df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx)
312-
313-
if ard_config.get("generate_glstring"):
314-
for subject in locus_column_mapping:
315-
slug_columns = []
316-
for locus in locus_column_mapping[subject]:
317-
slug_column = locus + "_slug"
318-
slug_columns.append(slug_column)
319-
if len(locus_column_mapping[subject][locus]) > 1:
320-
df[slug_column] = (
321-
df[locus_column_mapping[subject][locus][0]]
322-
+ "+"
323-
+ df[locus_column_mapping[subject][locus][1]]
324-
)
325-
else:
326-
df[slug_column] = df[locus_column_mapping[subject][locus][0]]
376+
locus_column_mapping = ard_config.get("locus_column_mapping", None)
377+
if locus_column_mapping:
378+
reduce_locus_columns(df, ard_config, locus_column_mapping, verbose)
327379

328-
df[subject + "_gl"] = df[slug_columns].agg("^".join, axis=1)
329-
df[subject + "_gl"] = df[subject + "_gl"].apply(
330-
lambda gl: gl.replace("^+", "")
331-
)
332-
df.drop(columns=slug_columns, inplace=True)
380+
glstring_columns = ard_config.get("glstring_columns", None)
381+
if glstring_columns:
382+
reduce_glstring_columns(df, ard_config, glstring_columns)
333383

334384
# Save as XLSX if specified
335385
if ard_config["output_file_format"] == "xlsx":

0 commit comments

Comments
 (0)