Commit b9ca634

Extra Helper Scripts for Batch processing CSV File (#69)
`reduce_csv.py`: the driver file
`conf.py`: configuration
`sample.csv`: sample CSV file
1 parent 923f95b commit b9ca634

File tree

4 files changed: +309 -0 lines changed


extras/README.md

Lines changed: 100 additions & 0 deletions
# Extras

# Batch Script for CSV File

**Example Scripts to batch reduce HLA typings from a CSV File**

The `reduce_csv.py` and `conf.py` scripts can be used to take a CSV file with HLA
typing data, reduce certain columns, and produce a new CSV or Excel file.

For most use cases, installing `py-ard`, specifying the changes in the `conf.py` file
and running `python reduce_csv.py` will produce results based on the configuration
in `conf.py`.

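For example, a typical session might look like this (assuming `py-ard` is installed from PyPI):

```shell
pip install py-ard
python reduce_csv.py
```

The configuration in `conf.py` looks like this:
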
```python
#
# Configurations for processing CSV files
#

# The column names that are in the CSV
# The output file will have these columns
all_columns_in_csv = [
    "nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
    "r_dpb1_typ1", "r_dpb1_typ2"
]

#
# List of columns which have typing information and need to be reduced.
# The locus is the 2nd term in the column name.
# E.g.: for column r_drb1_typ1, DRB1 is the locus name
#
columns_to_reduce_in_csv = [
    "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
    "r_dpb1_typ2"
]

#
# Configuration options for ARD reduction of a CSV file
#
ard_config = {
    # All columns in the CSV file
    "csv_in_column_names": all_columns_in_csv,

    # Columns to check for typings
    "columns_to_check": columns_to_reduce_in_csv,

    # How the typings should be reduced
    # Valid options:
    #   - G
    #   - lg
    #   - lgx
    "redux_type": "lgx",

    # Input CSV filename
    "in_csv_filename": "sample.csv",

    # Output CSV filename
    "out_csv_filename": 'clean_sample.csv',

    # Use compression
    # Valid options:
    #   - 'gzip'
    #   - 'zip'
    #   - None
    "apply_compression": 'gzip',

    # Show verbose log
    # Valid options:
    #   - True
    #   - False
    "verbose_log": True,

    # What to reduce?
    "reduce_serology": False,
    "reduce_v2": True,
    "reduce_3field": True,
    "reduce_P": True,
    "reduce_XX": False,
    "reduce_MAC": True,

    # Is the locus name present in the allele?
    # E.g. A*01:01 vs 01:01
    "locus_in_allele_name": False,

    # Output format
    # Valid options:
    #   - csv
    #   - xlsx
    "output_file_format": 'csv',

    # Add a separate column for each processed column
    "new_column_for_redux": False,
}
```

The included sample CSV file `sample.csv` can be processed using the script:

```shell
python reduce_csv.py
```
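
Under the hood, `reduce_csv.py` builds a locus-prefixed allele for each typing and calls `py-ard`'s `redux_gl` on it. A minimal sketch of that core call (the allele below is taken from `sample.csv`; the reduced value depends on the IMGT/HLA database version that `py-ard` has loaded):

```python
import pyard

# Same initialization that reduce_csv.py uses
ard = pyard.ARD(remove_invalid=False)

# Reduce a single typing to the "lgx" level configured in conf.py
print(ard.redux_gl("DPB1*193:01:01", "lgx"))
```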

extras/conf.py

Lines changed: 78 additions & 0 deletions
```python
#
# Configurations for processing CSV files
#

# The column names that are in the CSV
# The output file will have these columns
all_columns_in_csv = [
    "nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
    "r_dpb1_typ1", "r_dpb1_typ2"
]

#
# List of columns which have typing information and need to be reduced.
# The locus is the 2nd term in the column name.
# E.g.: for column r_drb1_typ1, DRB1 is the locus name
#
columns_to_reduce_in_csv = [
    "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
    "r_dpb1_typ2"
]

#
# Configuration options for ARD reduction of a CSV file
#
ard_config = {
    # All columns in the CSV file
    "csv_in_column_names": all_columns_in_csv,

    # Columns to check for typings
    "columns_to_check": columns_to_reduce_in_csv,

    # How the typings should be reduced
    # Valid options:
    #   - G
    #   - lg
    #   - lgx
    "redux_type": "lgx",

    # Input CSV filename
    "in_csv_filename": "sample.csv",

    # Output CSV filename
    "out_csv_filename": 'clean_sample.csv',

    # Use compression
    # Valid options:
    #   - 'gzip'
    #   - 'zip'
    #   - None
    "apply_compression": 'gzip',

    # Show verbose log
    # Valid options:
    #   - True
    #   - False
    "verbose_log": True,

    # What to reduce?
    "reduce_serology": False,
    "reduce_v2": True,
    "reduce_3field": True,
    "reduce_P": True,
    "reduce_XX": False,
    "reduce_MAC": True,

    # Is the locus name present in the allele?
    # E.g. A*01:01 vs 01:01
    "locus_in_allele_name": False,

    # Output format
    # Valid options:
    #   - csv
    #   - xlsx
    "output_file_format": 'csv',

    # Add a separate column for each processed column
    "new_column_for_redux": False,
}
```

extras/reduce_csv.py

Lines changed: 127 additions & 0 deletions
```python
#
# Quick script to reduce alleles from a CSV file
#
# Use `conf.py` to set up the configuration used here.
# For Excel output, the openpyxl library needs to be installed:
#   pip install openpyxl
#

import re

import pandas as pd
import pyard

from conf import ard_config

verbose = ard_config["verbose_log"]
white_space_regex = re.compile(r"\s+")


def is_serology(allele: str) -> bool:
    # Serology typings have a single field (no ':' separators)
    return len(allele.split(':')) == 1


def is_3field(allele: str) -> bool:
    return len(allele.split(':')) > 2


def is_P(allele: str) -> bool:
    if allele.endswith('P'):
        fields = allele.split(':')
        if len(fields) == 2:  # Ps are 2 fields
            return fields[0].isdigit() and fields[1][:-1].isdigit()
    return False


def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    if allele:
        # Remove all white spaces
        allele = white_space_regex.sub('', allele)
        locus = column_name.split('_')[1].upper()
        # If the allele comes in as an allele list, apply reduce to all alleles
        if '/' in allele:
            return "/".join(reduce(a, locus) for a in allele.split('/'))
        else:
            return reduce(allele, locus)
    return allele


def should_be_reduced(allele, locus_allele):
    if is_serology(allele):
        return ard_config["reduce_serology"]

    if ard_config["reduce_v2"]:
        if ard.is_v2(locus_allele):
            return True

    if ard_config["reduce_3field"]:
        if is_3field(locus_allele):
            return True

    if ard_config["reduce_P"]:
        if is_P(allele):
            return True

    if ard_config["reduce_XX"]:
        if ard.is_XX(locus_allele):
            return True

    if ard_config["reduce_MAC"]:
        if ard.is_mac(locus_allele) and not ard.is_XX(locus_allele):
            return True

    return False


def reduce(allele, locus):
    # Does the allele name have the locus in it?
    if '*' in allele:
        locus_allele = allele
    elif ard_config["locus_in_allele_name"]:
        locus_allele = allele
    else:
        locus_allele = f"{locus}*{allele}"

    # Check the config to see if this allele should be reduced
    if should_be_reduced(allele, locus_allele):
        reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
        if reduced_allele:
            # Strip the locus prefix from each allele in the reduced GL string
            allele = "/".join(map(lambda a: a.split('*')[1],
                                  reduced_allele.split('/')))
        else:
            if verbose:
                print(f"Failed to reduce {locus_allele}")

    if verbose:
        print(f"\t{locus_allele} => {allele}")
    return allele


if __name__ == '__main__':
    ard = pyard.ARD(remove_invalid=False)

    df = pd.read_csv(ard_config["in_csv_filename"], names=ard_config["csv_in_column_names"], header=0, dtype=str)
    df.fillna('', inplace=True)

    for column in ard_config["columns_to_check"]:
        if verbose:
            print(f"Column:{column} =>")
        if ard_config["new_column_for_redux"]:
            # Insert a new column next to the original one
            new_column_name = f"reduced_{column}"
            new_column_index = df.columns.get_loc(column) + 1
            df.insert(new_column_index, new_column_name, df[column].apply(clean_locus, column_name=column))
        else:
            df[column] = df[column].apply(clean_locus, column_name=column)

    if ard_config["output_file_format"] == 'xlsx':
        out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
        df.to_excel(out_file_name, index=False)
    else:
        out_file_name = ard_config['out_csv_filename'] + ('.gz' if ard_config['apply_compression'] else '')
        df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
    if verbose:
        print(f"Saved result to file: {out_file_name}")
```

extras/sample.csv

Lines changed: 4 additions & 0 deletions
```
nmdp_id,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2
123,A*23:71,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01
```
