Skip to content

Commit b7fa29c

Browse files
committed
Merged from upstream
2 parents 935122d + fbd7cd7 commit b7fa29c

File tree

11 files changed

+358
-15
lines changed

11 files changed

+358
-15
lines changed

README.rst

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -89,9 +89,8 @@ Example
8989
# 'B*07:02:01G+B*07:02:01G^A*01:01:01G+A*02:01:01G/A*02:02'
9090
9191
# py-ard can also reduce serology based typings
92-
ard.redux_gl('HLA-A*10^HLA-A*9', 'lg')
93-
# >>> ard_gl
94-
# 'HLA-A*24:19g/HLA-A*24:22g^HLA-A*26:01g/HLA-A*26:10g/HLA-A*26:15g/HLA-A*26:92g/HLA-A*66:01g/HLA-A*66:03g'
92+
ard.redux_gl('B14', 'lg')
93+
# >>> 'B*14:01g/B*14:02g/B*14:03g/B*14:04g/B*14:05g/B*14:06g/B*14:08g/B*14:09g/B*14:10g/B*14:11g/B*14:12g/B*14:13g/B*14:14g/B*14:15g/B*14:16g/B*14:17g/B*14:18g/B*14:19g/B*14:20g/B*14:21g/B*14:22g/B*14:23g/B*14:24g/B*14:25g/B*14:26g/B*14:27g/B*14:28g/B*14:29g/B*14:30g/B*14:31g/B*14:32g/B*14:33g/B*14:34g/B*14:35g/B*14:36g/B*14:37g/B*14:38g/B*14:39g/B*14:40g/B*14:42g/B*14:43g/B*14:44g/B*14:45g/B*14:46g/B*14:47g/B*14:48g/B*14:49g/B*14:50g/B*14:51g/B*14:52g/B*14:53g/B*14:54g/B*14:55g/B*14:56g/B*14:57g/B*14:58g/B*14:59g/B*14:60g/B*14:62g/B*14:63g/B*14:65g/B*14:66g/B*14:68g/B*14:70Qg/B*14:71g/B*14:73g/B*14:74g/B*14:75g/B*14:77g/B*14:82g/B*14:83g/B*14:86g/B*14:87g/B*14:88g/B*14:90g/B*14:93g/B*14:94g/B*14:95g/B*14:96g/B*14:97g/B*14:99g/B*14:102g'
9594
9695
9796

extras/README.md

Lines changed: 100 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,100 @@
1+
# Extras
2+
3+
# Batch Script for CSV File
4+
5+
**Example Scripts to batch reduce HLA typings from a CSV File**
6+
7+
`reduce_csv.py` and `conf.py` scripts can be used to take a CSV file with HLA
8+
typing data and reduce certain columns and produce a new CSV and Excel file.
9+
10+
For most use cases, installing `py-ard`, specifying the changes in the `conf.py` file
11+
and running `python reduce_csv.py` will produce results based on the configuration
12+
in the `conf.py`.
13+
14+
15+
```python
16+
#
17+
# configurations for processing CSV files
18+
#
19+
20+
# The column names that are in CSV
21+
# The output file will have these columns
22+
all_columns_in_csv = [
23+
"nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
24+
"r_dpb1_typ1", "r_dpb1_typ2"
25+
]
26+
27+
#
28+
# List of columns which have typing information and need to be reduced.
29+
# The locus is the 2nd term in the column name
30+
# Eg: For column r_drb1_typ1, DRB1 is the locus name
31+
#
32+
columns_to_reduce_in_csv = [
33+
"r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
34+
"r_dpb1_typ2"
35+
]
36+
37+
#
38+
# Configuration options to ARD reduction of a CSV file
39+
#
40+
ard_config = {
41+
# All Columns in the CSV file
42+
"csv_in_column_names": all_columns_in_csv,
43+
44+
# Columns to check for typings
45+
"columns_to_check": columns_to_reduce_in_csv,
46+
47+
# How should the typings be reduced
48+
# Valid Options:
49+
# - G
50+
# - lg
51+
# - lgx
52+
"redux_type": "lgx",
53+
54+
# Input CSV filename
55+
"in_csv_filename": "sample.csv",
56+
57+
# Output CSV filename
58+
"out_csv_filename": 'clean_sample.csv',
59+
60+
# Use compression
61+
# Valid options
62+
# - 'gzip'
63+
# - 'zip'
64+
# - None
65+
"apply_compression": 'gzip',
66+
67+
# Show verbose log
68+
# Valid options:
69+
# - True
70+
# - False
71+
"verbose_log": True,
72+
73+
# What to reduce ?
74+
"reduce_serology": False,
75+
"reduce_v2": True,
76+
"reduce_3field": True,
77+
"reduce_P": True,
78+
"reduce_XX": False,
79+
"reduce_MAC": True,
80+
81+
# Is locus name present in allele
82+
# Eg. A*01:01 vs 01:01
83+
"locus_in_allele_name": False,
84+
85+
# Format
86+
# Valid options:
87+
# - csv
88+
# - xlsx
89+
"output_file_format": 'csv',
90+
91+
# Add a separate column for processed column
92+
"new_column_for_redux": False,
93+
}
94+
```
95+
96+
The included sample CSV file `sample.csv` can be processed using the script.
97+
98+
```shell
python reduce_csv.py
```

extras/conf.py

Lines changed: 78 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,78 @@
1+
#
# Configuration for processing CSV files
#

# The column names that are in the CSV.
# The output file will have these columns.
all_columns_in_csv = [
    "nmdp_id", "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2",
    "r_dpb1_typ1", "r_dpb1_typ2"
]

#
# List of columns which have typing information and need to be reduced.
# The locus is the 2nd term in the column name.
# Eg: For column r_drb1_typ1, DRB1 is the locus name.
#
columns_to_reduce_in_csv = [
    "r_a_typ1", "r_a_typ2", "r_b_typ1", "r_b_typ2", "r_c_typ1", "r_c_typ2", "r_drb1_typ1", "r_drb1_typ2", "r_dpb1_typ1",
    "r_dpb1_typ2"
]

#
# Configuration options for ARD reduction of a CSV file
#
ard_config = {
    # All Columns in the CSV file
    "csv_in_column_names": all_columns_in_csv,

    # Columns to check for typings
    "columns_to_check": columns_to_reduce_in_csv,

    # How should the typings be reduced
    # Valid Options:
    #   - G
    #   - lg
    #   - lgx
    "redux_type": "lgx",

    # Input CSV filename
    "in_csv_filename": "sample.csv",

    # Output CSV filename
    "out_csv_filename": 'clean_sample.csv',

    # Use compression
    # Valid options:
    #   - 'gzip'
    #   - 'zip'
    #   - None
    "apply_compression": 'gzip',

    # Show verbose log
    # Valid options:
    #   - True
    #   - False
    "verbose_log": True,

    # What to reduce?
    "reduce_serology": False,
    "reduce_v2": True,
    "reduce_3field": True,
    "reduce_P": True,
    "reduce_XX": False,
    "reduce_MAC": True,

    # Is the locus name present in the allele?
    # Eg. A*01:01 vs 01:01
    "locus_in_allele_name": False,

    # Output format
    # Valid options:
    #   - csv
    #   - xlsx
    "output_file_format": 'csv',

    # Add a separate column for each processed column
    "new_column_for_redux": False,
}

extras/reduce_csv.py

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
#
2+
#
3+
# Quick script to reduce alleles from a CSV file
4+
#
5+
# Use `conf.py` to setup configurations that's used here
6+
# For Excel output, openpyxl library needs to be installed.
7+
# pip install openpyxl
8+
#
9+
10+
import pandas as pd
11+
import pyard
12+
import re
13+
14+
from conf import ard_config
15+
16+
# Module-level verbosity flag, read by reduce() and the __main__ driver.
verbose = ard_config["verbose_log"]
# Pre-compiled pattern used to strip ALL whitespace from a typing value.
white_space_regex = re.compile(r"\s+")
19+
20+
def is_serology(allele: str) -> bool:
    """Return True if *allele* looks like a serology typing (no ':' fields).

    A serology typing is a single field, e.g. 'B14'; molecular typings have
    colon-separated fields, e.g. 'B*14:01'.
    """
    # The original fell off the end and returned None for molecular typings;
    # make the declared bool contract explicit.
    return len(allele.split(':')) == 1
23+
24+
25+
def is_3field(allele: str) -> bool:
    """Return True when the typing has three or more fields (two+ colons)."""
    return allele.count(':') >= 2
27+
28+
29+
def is_P(allele: str) -> bool:
    """Return True if *allele* is a P-group typing, e.g. '02:01P'.

    A P-group name is exactly two numeric fields with a trailing 'P'.
    """
    if allele.endswith('P'):
        fields = allele.split(':')
        if len(fields) == 2:  # Ps are 2 fields
            # Both fields must be numeric; strip the trailing 'P' from the
            # second field before checking. The original tested fields[0]
            # twice, so e.g. '14:ABP' was wrongly accepted.
            return fields[0].isdigit() and fields[1][:-1].isdigit()
    return False
35+
36+
37+
def clean_locus(allele: str, column_name: str = 'Unknown') -> str:
    """Reduce one CSV cell, deriving the locus from the column name.

    The locus is the 2nd '_'-separated term of *column_name*
    (e.g. 'r_drb1_typ1' -> 'DRB1'). Empty cells pass through unchanged.
    """
    if allele:
        # Remove all white spaces
        allele = white_space_regex.sub('', allele)
        locus = column_name.split('_')[1].upper()
        # If the allele comes in as an allele list, reduce every member with
        # the SAME locus. The original used map(reduce, parts, locus), which
        # zipped the parts with the *characters* of the locus string — wrong
        # locus per part and silent truncation to len(locus) items.
        if '/' in allele:
            return "/".join(reduce(a, locus) for a in allele.split('/'))
        else:
            return reduce(allele, locus)
    return allele
48+
49+
50+
def should_be_reduced(allele, locus_allele):
    """Decide from ard_config whether this typing should be reduced.

    *allele* is the raw cell value; *locus_allele* is the locus-qualified
    form used for the py-ard classification checks.
    """
    # Serology is decided solely by its config switch.
    if is_serology(allele):
        return ard_config["reduce_serology"]

    if ard_config["reduce_v2"] and ard.is_v2(locus_allele):
        return True
    if ard_config["reduce_3field"] and is_3field(locus_allele):
        return True
    if ard_config["reduce_P"] and is_P(allele):
        return True
    if ard_config["reduce_XX"] and ard.is_XX(locus_allele):
        return True
    # MACs that are really XX codes are handled by the XX switch above.
    if ard_config["reduce_MAC"] and ard.is_mac(locus_allele) and not ard.is_XX(locus_allele):
        return True

    return False
75+
76+
77+
def reduce(allele, locus):
    """Reduce a single typing and return it in the same form it came in.

    If the input had no locus prefix, the prefix is stripped back off the
    reduced result before returning.
    """
    # Qualify with the locus unless the allele already carries one
    # (explicit '*' or configured to be present).
    if '*' in allele or ard_config["locus_in_allele_name"]:
        locus_allele = allele
    else:
        locus_allele = f"{locus}*{allele}"

    # Consult the config to see whether this typing should be reduced at all.
    if should_be_reduced(allele, locus_allele):
        reduced_allele = ard.redux_gl(locus_allele, ard_config["redux_type"])
        if reduced_allele:
            # Drop the locus prefix from every member of the reduced list.
            allele = "/".join(part.split('*')[1] for part in reduced_allele.split('/'))
        elif verbose:
            print(f"Failed to reduce {locus_allele}")

    if verbose:
        print(f"\t{locus_allele} => {allele}")
    return allele
101+
102+
103+
if __name__ == '__main__':
    # Build the reducer once; keep invalid typings instead of raising so a
    # bad cell is reported rather than aborting the whole batch.
    ard = pyard.ARD(remove_invalid=False)

    # Read every column as str; blank cells become '' so clean_locus can
    # pass them through unchanged.
    df = pd.read_csv(ard_config["in_csv_filename"], names=ard_config["csv_in_column_names"], header=0, dtype=str)
    df.fillna('', inplace=True)

    for column in ard_config["columns_to_check"]:
        if verbose:
            print(f"Column:{column} =>")
        if ard_config["new_column_for_redux"]:
            # Insert the reduced values as a new column right after the original.
            new_column_name = f"reduced_{column}"
            new_column_index = df.columns.get_loc(column) + 1
            df.insert(new_column_index, new_column_name, df[column].apply(clean_locus, column_name=column))
        else:
            # Reduce in place.
            df[column] = df[column].apply(clean_locus, column_name=column)

    if ard_config["output_file_format"] == 'xlsx':
        out_file_name = f"{ard_config['out_csv_filename']}.xlsx"
        df.to_excel(out_file_name, index=False)
    else:
        # Keep the base filename either way; the original f-string produced
        # an EMPTY filename whenever compression was turned off.
        # NOTE(review): '.gz' is appended even for 'zip' compression —
        # presumably only 'gzip' is used in practice; verify if 'zip' matters.
        out_file_name = ard_config['out_csv_filename'] + ('.gz' if ard_config['apply_compression'] else '')
        df.to_csv(out_file_name, index=False, compression=ard_config["apply_compression"])
    if verbose:
        print(f"Saved result to file:{out_file_name}")

extras/sample.csv

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
nmdp_id,r_a_typ1,r_a_typ2,r_b_typ1,r_b_typ2,r_c_typ1,r_c_typ2,r_drb1_typ1,r_drb1_typ2,r_dpb1_typ1,r_dpb1_typ2
2+
123,A*23:71,A*29:79,B*18:67,B*51:275,C*05:01:19,C*02:85:02,DRB1*03:03,DRB1*14:144,DPB1*193:01:01,DPB1*582:01:01
3+
456,A*01:01:42,A*30:12:02,B*44:02:32,B*35:42,C*03:148,C*04:322,DRB1*13:01:16,DRB1*15:80N,DPB1*914:01:01,DPB1*278:01:01
4+
789,A*02:247,A*03:227,B*15:570,B*07:02:01:17,C*16:01:10,C*06:102,DRB1*13:156,DRB1*14:167:01,DPB1*405:01:01,DPB1*479:01:01

pyard/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,4 @@
2424
from .pyard import ARD
2525

2626
__author__ = """NMDP Bioinformatics"""
27-
__version__ = '0.6.1'
27+
__version__ = '0.6.2'

pyard/broad_splits.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@
2828
#
2929
# Mapping Generated from `dna_relshp.csv` file
3030
#
31-
broad_splits_mapping = {
31+
broad_splits_dna_mapping = {
3232
'A*09': ['A*23', 'A*24'],
3333
'A*10': ['A*25', 'A*26', 'A*34', 'A*66'],
3434
'A*19': ['A*29', 'A*30', 'A*31', 'A*32', 'A*33', 'A*74'],
@@ -45,3 +45,23 @@
4545
'DRB1*02': ['DRB1*15', 'DRB1*16'],
4646
'DRB1*06': ['DRB1*13', 'DRB1*14']
4747
}
48+
# Mapping of broad serology antigens to their split antigens,
# keyed by serology name (no '*' / fields), e.g. 'A9' -> its splits.
broad_splits_ser_mapping = {
    'A9': ['A23', 'A24'],
    'A10': ['A25', 'A26', 'A34', 'A66'],
    'A19': ['A29', 'A30', 'A31', 'A32', 'A33', 'A74'],
    'A28': ['A68', 'A69'],
    'B5': ['B51', 'B52'],
    'B12': ['B44', 'B45'],
    'B15': ['B62', 'B63', 'B75', 'B76', 'B77'],
    'B16': ['B38', 'B39'],
    'B17': ['B57', 'B58'],
    'B21': ['B49', 'B50'],
    'B22': ['B54', 'B55', 'B56'],
    'B40': ['B60', 'B61'],
    'B70': ['B71', 'B72'],
    'DQ1': ['DQ5', 'DQ6'],
    'DR2': ['DR15', 'DR16'],
    'DR3': ['DR17', 'DR18'],
    'DR5': ['DR11', 'DR12'],
    'DR6': ['DR13', 'DR14']
}

0 commit comments

Comments
 (0)