Skip to content

Commit 086b40b

Browse files
committed
Generate GL String in Batch Mode
- Add `generate_glstring` option to `pyard-reduce-csv`
1 parent d5a0767 commit 086b40b

File tree

2 files changed

+103
-39
lines changed

2 files changed

+103
-39
lines changed

extras/reduce_conf.json

Lines changed: 48 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -14,18 +14,52 @@
1414
"r_dpb1_typ1",
1515
"r_dpb1_typ2"
1616
],
17-
"columns_to_reduce_in_csv": [
18-
"r_a_typ1",
19-
"r_a_typ2",
20-
"r_b_typ1",
21-
"r_b_typ2",
22-
"r_c_typ1",
23-
"r_c_typ2",
24-
"r_drb1_typ1",
25-
"r_drb1_typ2",
26-
"r_dpb1_typ1",
27-
"r_dpb1_typ2"
28-
],
17+
"locus_column_mapping": {
18+
"recipient": {
19+
"A": [
20+
"r_a_typ1",
21+
"r_a_typ2"
22+
],
23+
"B": [
24+
"r_b_typ1",
25+
"r_b_typ2"
26+
],
27+
"C": [
28+
"r_c_typ1",
29+
"r_c_typ2"
30+
],
31+
"drb1": [
32+
"r_drb1_typ1",
33+
"r_drb1_typ2"
34+
],
35+
"dpb1": [
36+
"r_dpb1_typ1",
37+
"r_dpb1_typ2"
38+
]
39+
},
40+
"donor": {
41+
"A": [
42+
"r_a_typ1",
43+
"r_a_typ2"
44+
],
45+
"B": [
46+
"r_b_typ1",
47+
"r_b_typ2"
48+
],
49+
"C": [
50+
"r_c_typ1",
51+
"r_c_typ2"
52+
],
53+
"drb1": [
54+
"r_drb1_typ1",
55+
"r_drb1_typ2"
56+
],
57+
"dpb1": [
58+
"r_dpb1_typ1",
59+
"r_dpb1_typ2"
60+
]
61+
}
62+
},
2963
"redux_type": "lgx",
3064
"reduce_serology": false,
3165
"reduce_v2": true,
@@ -36,10 +70,11 @@
3670
"reduce_XX": false,
3771
"reduce_MAC": true,
3872
"locus_in_allele_name": true,
39-
"keep_locus_in_allele_name": false,
73+
"keep_locus_in_allele_name": true,
4074
"output_file_format": "csv",
4175
"new_column_for_redux": false,
4276
"map_drb345_to_drbx": false,
4377
"apply_compression": "gzip",
78+
"generate_glstring": false,
4479
"verbose_log": true
4580
}

scripts/pyard-reduce-csv

Lines changed: 55 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -154,11 +154,10 @@ def reduce(allele, locus, column_name):
154154
return allele
155155

156156

157-
def clean_locus(allele: str, column_name: str = "Unknown") -> str:
157+
def clean_locus(allele: str, locus: str, column_name: str = "Unknown") -> str:
158158
if allele:
159159
# Remove all white spaces
160160
allele = white_space_regex.sub("", allele)
161-
locus = column_name.split("_")[1].upper()
162161
# If the allele comes in as an allele list, apply reduce to all alleles
163162
if "/" in allele:
164163
return "/".join(map(reduce, allele.split("/"), locus, column_name))
@@ -216,32 +215,49 @@ if __name__ == "__main__":
216215
# Read only the columns to be saved.
217216
# Header is the first row
218217
# Don't convert to NAs
219-
df = pd.read_csv(
220-
ard_config["in_csv_filename"],
221-
usecols=ard_config["columns_from_csv"],
222-
header=0,
223-
dtype=str,
224-
keep_default_na=False,
225-
)
218+
try:
219+
df = pd.read_csv(
220+
ard_config["in_csv_filename"],
221+
usecols=ard_config["columns_from_csv"],
222+
header=0,
223+
dtype=str,
224+
keep_default_na=False,
225+
)
226+
except FileNotFoundError as e:
227+
print(f"File not found {ard_config['in_csv_filename']}", file=sys.stderr)
228+
sys.exit(1)
226229

230+
reduce_prefix = "reduced_"
227231
failed_to_reduce_alleles = []
228-
# Reduce each of the specified columns
229-
for column in ard_config["columns_to_reduce_in_csv"]:
230-
if verbose:
231-
print(f"Column:{column} =>")
232-
if ard_config["new_column_for_redux"]:
233-
# insert a new column
234-
new_column_name = f"reduced_{column}"
235-
new_column_index = df.columns.get_loc(column) + 1
236-
# Apply clean_locus function to the column and insert as a new column
237-
df.insert(
238-
new_column_index,
239-
new_column_name,
240-
df[column].apply(clean_locus, column_name=column),
241-
)
242-
else:
243-
# Apply clean_locus function to the column and replace the column
244-
df[column] = df[column].apply(clean_locus, column_name=column)
232+
reduced_column_mappings = {}
233+
locus_column_mapping = ard_config["locus_column_mapping"]
234+
for subject in locus_column_mapping:
235+
reduced_column_mappings[subject] = {}
236+
for locus in locus_column_mapping[subject]:
237+
if locus not in reduced_column_mappings[subject]:
238+
reduced_column_mappings[subject][locus] = []
239+
# Reduce each of the specified columns
240+
locus_columns = locus_column_mapping[subject][locus]
241+
for column in locus_columns:
242+
if verbose:
243+
print(f"Column:{column} =>")
244+
if ard_config["new_column_for_redux"]:
245+
# insert a new column
246+
new_column_name = f"{reduce_prefix}{column}"
247+
new_column_index = df.columns.get_loc(column) + 1
248+
# Apply clean_locus function to the column and insert as a new column
249+
df.insert(
250+
new_column_index,
251+
new_column_name,
252+
df[column].apply(clean_locus, locus=locus, column_name=column),
253+
)
254+
reduced_column_mappings[subject][locus].append(new_column_name)
255+
else:
256+
# Apply clean_locus function to the column and replace the column
257+
df[column] = df[column].apply(
258+
clean_locus, locus=locus, column_name=column
259+
)
260+
reduced_column_mappings[subject][locus].append(column)
245261

246262
# Map DRB3,DRB4,DRB5 to DRBX if specified
247263
# New columns DRBX_1 and DRBX_2 are created
@@ -257,6 +273,19 @@ if __name__ == "__main__":
257273
)
258274
df["DRBX_1"], df["DRBX_2"] = zip(*df_drbx)
259275

276+
if ard_config["generate_glstring"]:
277+
for subject in reduced_column_mappings:
278+
for haplotype_num in range(2):
279+
hap1_columns = list(
280+
map(
281+
lambda x: reduced_column_mappings[subject][x][haplotype_num],
282+
reduced_column_mappings[subject].keys(),
283+
)
284+
)
285+
df[subject + f"_haplotype_{(haplotype_num + 1)}"] = df[
286+
hap1_columns
287+
].agg("~".join, axis=1)
288+
260289
# Save as XLSX if specified
261290
if ard_config["output_file_format"] == "xlsx":
262291
out_file_name = f"{ard_config['out_csv_filename']}.xlsx"

0 commit comments

Comments (0)