
Commit c7b20af

Merge pull request #211 from pbashyal-nmdp/output_glstring_batchmode
Output glstring in batchmode
2 parents 31aa0e2 + aa80062 commit c7b20af

3 files changed: +105 −49 lines changed

extras/reduce_conf.json

Lines changed: 27 additions & 14 deletions
@@ -14,18 +14,30 @@
     "r_dpb1_typ1",
     "r_dpb1_typ2"
   ],
-  "columns_to_reduce_in_csv": [
-    "r_a_typ1",
-    "r_a_typ2",
-    "r_b_typ1",
-    "r_b_typ2",
-    "r_c_typ1",
-    "r_c_typ2",
-    "r_drb1_typ1",
-    "r_drb1_typ2",
-    "r_dpb1_typ1",
-    "r_dpb1_typ2"
-  ],
+  "locus_column_mapping": {
+    "recipient": {
+      "A": [
+        "r_a_typ1",
+        "r_a_typ2"
+      ],
+      "B": [
+        "r_b_typ1",
+        "r_b_typ2"
+      ],
+      "C": [
+        "r_c_typ1",
+        "r_c_typ2"
+      ],
+      "drb1": [
+        "r_drb1_typ1",
+        "r_drb1_typ2"
+      ],
+      "dqb1": [
+        "r_dpb1_typ1",
+        "r_dpb1_typ2"
+      ]
+    }
+  },
   "redux_type": "lgx",
   "reduce_serology": false,
   "reduce_v2": true,
@@ -36,10 +48,11 @@
   "reduce_XX": false,
   "reduce_MAC": true,
   "locus_in_allele_name": true,
-  "keep_locus_in_allele_name": false,
+  "keep_locus_in_allele_name": true,
   "output_file_format": "csv",
-  "new_column_for_redux": false,
+  "new_column_for_redux": true,
   "map_drb345_to_drbx": false,
   "apply_compression": "gzip",
+  "generate_glstring": true,
   "verbose_log": true
 }
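
For context, the new locus_column_mapping replaces the flat columns_to_reduce_in_csv list: typing columns are now grouped by subject and then by locus, which is what later lets the script assemble one GL string per subject. A minimal sketch of how such a nested mapping can be walked (illustrative only, not part of this commit; the config path is an assumption):

import json

# Load the example config shipped in extras/ (path is an assumption)
with open("extras/reduce_conf.json") as conf_file:
    ard_config = json.load(conf_file)

# subject -> locus -> list of typing columns, mirroring the new structure
for subject, loci in ard_config["locus_column_mapping"].items():
    for locus, columns in loci.items():
        print(subject, locus, columns)
# e.g. recipient A ['r_a_typ1', 'r_a_typ2']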

pyard/db.py

Lines changed: 0 additions & 1 deletion
@@ -22,7 +22,6 @@
 #
 import pathlib
 import sqlite3
-import sys
 from typing import Tuple, Dict, Set, List

 from pyard.misc import get_imgt_db_versions

scripts/pyard-reduce-csv

Lines changed: 78 additions & 34 deletions
@@ -110,7 +110,7 @@ def reduce(allele, locus, column_name):
         return allele
     if "*" in allele:
         locus_allele = allele
-    elif ard_config["locus_in_allele_name"]:
+    elif ard_config.get("locus_in_allele_name"):
         locus_allele = allele
     else:
         locus_allele = f"{locus}*{allele}"
@@ -129,7 +129,7 @@ def reduce(allele, locus, column_name):
         return allele
     # print(f"reduced to '{reduced_allele}'")
     if reduced_allele:
-        if ard_config["keep_locus_in_allele_name"]:
+        if ard_config.get("keep_locus_in_allele_name"):
             allele = reduced_allele
         else:
             allele = remove_locus_name(reduced_allele)
@@ -139,26 +139,25 @@ def reduce(allele, locus, column_name):
         if verbose:
             print(f"\t{locus_allele} => {allele}")
     else:
-        if ard_config["convert_v2_to_v3"]:
+        if ard_config.get("convert_v2_to_v3"):
             if ard.is_v2(locus_allele):
                 v3_allele = ard.v2_to_v3(locus_allele)
-                if not ard_config["keep_locus_in_allele_name"]:
+                if not ard_config.get("keep_locus_in_allele_name"):
                     allele = remove_locus_name(v3_allele)
                 else:
                     allele = v3_allele
                 if verbose:
                     print(f"\t{locus_allele} => {allele}")
-        elif ard_config["keep_locus_in_allele_name"]:
+        elif ard_config.get("keep_locus_in_allele_name"):
             allele = locus_allele

     return allele


-def clean_locus(allele: str, column_name: str = "Unknown") -> str:
+def clean_locus(allele: str, locus: str, column_name: str = "Unknown") -> str:
     if allele:
         # Remove all white spaces
         allele = white_space_regex.sub("", allele)
-        locus = column_name.split("_")[1].upper()
         # If the allele comes in as an allele list, apply reduce to all alleles
         if "/" in allele:
             return "/".join(map(reduce, allele.split("/"), locus, column_name))
@@ -187,17 +186,29 @@ if __name__ == "__main__":
         dest="imgt_version",
         help="IPD-IMGT/HLA db to use for redux",
     )
+    parser.add_argument(
+        "-q",
+        "--quiet",
+        dest="quiet",
+        action="store_true",
+        default=False,
+        help="Don't print verbose log",
+    )
     args = parser.parse_args()
     config_filename = args.config

     print("Using config file:", config_filename)
     with open(config_filename) as conf_file:
         ard_config = json.load(conf_file)

-    verbose = ard_config["verbose_log"]
+    if not args.quiet:
+        verbose = ard_config.get("verbose_log")
+    else:
+        verbose = False
+
     white_space_regex = re.compile(r"\s+")

-    if ard_config["output_file_format"] == "xlsx":
+    if ard_config.get("output_file_format") == "xlsx":
         try:
             import openpyxl
         except ImportError:
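
With the new -q/--quiet flag, the config's verbose_log setting can be overridden at run time, e.g. pyard-reduce-csv --config reduce_conf.json --quiet (assuming the script's existing option for the config file is spelled --config).
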
@@ -216,36 +227,48 @@ if __name__ == "__main__":
     # Read only the columns to be saved.
     # Header is the first row
     # Don't convert to NAs
-    df = pd.read_csv(
-        ard_config["in_csv_filename"],
-        usecols=ard_config["columns_from_csv"],
-        header=0,
-        dtype=str,
-        keep_default_na=False,
-    )
+    try:
+        df = pd.read_csv(
+            ard_config["in_csv_filename"],
+            usecols=ard_config["columns_from_csv"],
+            header=0,
+            dtype=str,
+            keep_default_na=False,
+        )
+    except FileNotFoundError as e:
+        print(f"File not found {ard_config.get('in_csv_filename')}", file=sys.stderr)
+        sys.exit(1)

+    reduce_prefix = "reduced_"
     failed_to_reduce_alleles = []
-    # Reduce each of the specified columns
-    for column in ard_config["columns_to_reduce_in_csv"]:
-        if verbose:
-            print(f"Column:{column} =>")
-        if ard_config["new_column_for_redux"]:
-            # insert a new column
-            new_column_name = f"reduced_{column}"
-            new_column_index = df.columns.get_loc(column) + 1
-            # Apply clean_locus function to the column and insert as a new column
-            df.insert(
-                new_column_index,
-                new_column_name,
-                df[column].apply(clean_locus, column_name=column),
-            )
-        else:
-            # Apply clean_locus function to the column and replace the column
-            df[column] = df[column].apply(clean_locus, column_name=column)
+    locus_column_mapping = ard_config["locus_column_mapping"]
+    for subject in locus_column_mapping:
+        for locus in locus_column_mapping[subject]:
+            # Reduce each of the specified columns
+            locus_columns = locus_column_mapping[subject][locus]
+            for column in locus_columns:
+                if verbose:
+                    print(f"Column:{column} =>")
+                if ard_config.get("new_column_for_redux"):
+                    # insert a new column
+                    new_column_name = f"{reduce_prefix}{column}"
+                    new_column_index = df.columns.get_loc(column) + 1
+                    # Apply clean_locus function to the column and insert as a new column
+                    df.insert(
+                        new_column_index,
+                        new_column_name,
+                        df[column].apply(clean_locus, locus=locus, column_name=column),
+                    )
+                    locus_columns[locus_columns.index(column)] = new_column_name
+                else:
+                    # Apply clean_locus function to the column and replace the column
+                    df[column] = df[column].apply(
+                        clean_locus, locus=locus, column_name=column
+                    )

     # Map DRB3,DRB4,DRB5 to DRBX if specified
     # New columns DRBX_1 and DRBX_2 are created
-    if ard_config["map_drb345_to_drbx"]:
+    if ard_config.get("map_drb345_to_drbx"):
         drbx_loci = ["DRB3", "DRB4", "DRB5"]
         drbx_columns = [
             col_name for col_name in df.columns if col_name.split("_")[1] in drbx_loci
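
The reduction loop now receives the locus from the mapping instead of parsing it out of the column name, and it records the reduced_ column names back into locus_column_mapping for the GL string step below. A self-contained sketch of the insert-new-column path, using a stand-in clean_locus because the real one reduces alleles via py-ard (everything else mirrors the pattern above):

import pandas as pd

def clean_locus(allele, locus, column_name="Unknown"):
    # Stand-in for the script's clean_locus(); the real one reduces via py-ard
    return allele if "*" in allele else f"{locus}*{allele}"

df = pd.DataFrame({"r_a_typ1": ["01:01", "02:01"]})
locus, column = "A", "r_a_typ1"

# Apply per column and insert the result right after the source column
new_column_name = f"reduced_{column}"
new_column_index = df.columns.get_loc(column) + 1
df.insert(
    new_column_index,
    new_column_name,
    df[column].apply(clean_locus, locus=locus, column_name=column),
)
# df now has a reduced_r_a_typ1 column holding A*01:01 and A*02:01
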
@@ -257,6 +280,27 @@ if __name__ == "__main__":
         )
         df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx)

+    if ard_config.get("generate_glstring"):
+        for subject in locus_column_mapping:
+            slug_columns = []
+            for locus in locus_column_mapping[subject]:
+                slug_column = locus + "_slug"
+                slug_columns.append(slug_column)
+                if len(locus_column_mapping[subject][locus]) > 1:
+                    df[slug_column] = (
+                        df[locus_column_mapping[subject][locus][0]]
+                        + "+"
+                        + df[locus_column_mapping[subject][locus][1]]
+                    )
+                else:
+                    df[slug_column] = df[locus_column_mapping[subject][locus][0]]
+
+            df[subject + "_gl"] = df[slug_columns].agg("^".join, axis=1)
+            df[subject + "_gl"] = df[subject + "_gl"].apply(
+                lambda gl: gl.replace("^+", "")
+            )
+            df.drop(columns=slug_columns, inplace=True)
+
     # Save as XLSX if specified
     if ard_config["output_file_format"] == "xlsx":
         out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
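
The generate_glstring step pairs the two typings of each locus with "+", joins the per-locus slugs with "^" into a subject_gl column (e.g. recipient_gl), and strips the "^+" left behind when a locus is untyped. A toy reproduction of that logic on already-reduced columns (column names and values are illustrative only, not from the commit):

import pandas as pd

# Already-reduced typings for one subject; the C locus is untyped here
df = pd.DataFrame(
    {
        "reduced_r_a_typ1": ["A*01:01"], "reduced_r_a_typ2": ["A*02:01"],
        "reduced_r_b_typ1": ["B*07:02"], "reduced_r_b_typ2": ["B*08:01"],
        "reduced_r_c_typ1": [""], "reduced_r_c_typ2": [""],
    }
)
mapping = {
    "A": ["reduced_r_a_typ1", "reduced_r_a_typ2"],
    "B": ["reduced_r_b_typ1", "reduced_r_b_typ2"],
    "C": ["reduced_r_c_typ1", "reduced_r_c_typ2"],
}

slug_columns = []
for locus, columns in mapping.items():
    slug_column = locus + "_slug"
    slug_columns.append(slug_column)
    # Pair the two typings of a locus with "+"
    df[slug_column] = df[columns[0]] + "+" + df[columns[1]]

# Join loci with "^", then drop the "^+" left by the untyped locus
df["recipient_gl"] = df[slug_columns].agg("^".join, axis=1)
df["recipient_gl"] = df["recipient_gl"].apply(lambda gl: gl.replace("^+", ""))
df.drop(columns=slug_columns, inplace=True)

print(df["recipient_gl"].iloc[0])  # A*01:01+A*02:01^B*07:02+B*08:01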
