Skip to content

Commit 63a11a5

Browse files
committed
Support Serology
- Gets the WMDA `rel_dna_ser.txt` file for the corresponding version of the IMGT database to produce the serology mapping; `0` and `?` entries are ignored.
- Saves the mapped data to the `serology_mapping` table.
- Adds a Serology Gherkin test.
1 parent 270b0e0 commit 63a11a5

File tree

6 files changed

+198
-77
lines changed

6 files changed

+198
-77
lines changed

README.rst

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,15 +72,19 @@ Example
7272
allele = "A*01:01:01"
7373
7474
ard.redux(allele, 'G')
75-
# >> 'A*01:01:01G'
75+
# 'A*01:01:01G'
7676
7777
ard.redux(allele, 'lg')
78-
# >> 'A*01:01g'
78+
# 'A*01:01g'
7979
8080
ard.redux(allele, 'lgx')
8181
# 'A*01:01'
8282
8383
ard.redux_gl("A*01:01/A*01:01N+A*02:AB^B*07:02+B*07:AB", "G")
8484
# 'B*07:02:01G+B*07:02:01G^A*01:01:01G+A*02:01:01G/A*02:02'
8585
86+
# py-ard can also reduce serology-based typings
87+
ard.redux_gl('HLA-A*10^HLA-A*9', 'lg')
88+
# 'HLA-A*24:19g/HLA-A*24:22g^HLA-A*26:01g/HLA-A*26:10g/HLA-A*26:15g/HLA-A*26:92g/HLA-A*66:01g/HLA-A*66:03g'
89+
8690

pyard/data_repository.py

Lines changed: 105 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import functools
12
import sqlite3
23

34
import pandas as pd
@@ -6,6 +7,8 @@
67
from pyard.broad_splits import broad_splits_mapping
78

89
# GitHub URL where IMGT HLA files are downloaded.
10+
from pyard.smart_sort import smart_sort_comparator
11+
912
IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'
1013

1114
# List of expression characters
@@ -97,65 +100,6 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
97100
return dup_g, g_group, lg_group, lgx_group
98101

99102

100-
def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool):
101-
"""
102-
MAC files come in 2 different versions:
103-
104-
Martin: when they’re printed, the first is better for encoding and the
105-
second is better for decoding. The entire list was maintained both as an
106-
excel spreadsheet and also as a sybase database table. The excel was the
107-
one that was printed and distributed.
108-
109-
**==> numer.v3.txt <==**
110-
111-
Sorted by the length and the the values in the list
112-
```
113-
"LAST UPDATED: 09/30/20"
114-
CODE SUBTYPE
115-
116-
AB 01/02
117-
AC 01/03
118-
AD 01/04
119-
AE 01/05
120-
AG 01/06
121-
AH 01/07
122-
AJ 01/08
123-
```
124-
125-
**==> alpha.v3.txt <==**
126-
127-
Sorted by the code
128-
129-
```
130-
"LAST UPDATED: 10/01/20"
131-
* CODE SUBTYPE
132-
133-
AA 01/02/03/05
134-
AB 01/02
135-
AC 01/03
136-
AD 01/04
137-
AE 01/05
138-
AF 01/09
139-
AG 01/06
140-
```
141-
142-
:param db_connection:
143-
:param data_dir:
144-
:return:
145-
"""
146-
mac_table_name = 'mac_codes'
147-
if refresh_mac or not db.table_exists(db_connection, mac_table_name):
148-
# Load the MAC file to a DataFrame
149-
mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
150-
df_mac = pd.read_csv(mac_url, sep='\t', compression='zip',
151-
skiprows=3, names=['Code', 'Alleles'])
152-
# Create a dict from code to alleles
153-
mac = df_mac.set_index("Code")["Alleles"].to_dict()
154-
# Save the mac dict to db
155-
db.save_dict(db_connection, table_name=mac_table_name,
156-
dictionary=mac, columns=('code', 'alleles'))
157-
158-
159103
def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
160104
"""
161105
Checks to see if there's already an allele list file for the `imgt_version`
@@ -226,10 +170,110 @@ def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_versio
226170
else:
227171
xx_codes[broad] = xx_codes[split]
228172

229-
# Save this version of the valid alleles and xx codes
173+
# Save this version of the valid alleles
230174
db.save_set(db_connection, 'alleles', valid_alleles, 'allele')
231-
flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()}
175+
# Save this version of xx codes
176+
flat_xx_codes = {k: '/'.join(sorted(v, key=functools.cmp_to_key(smart_sort_comparator)))
177+
for k, v in xx_codes.items()}
232178
db.save_dict(db_connection, 'xx_codes', flat_xx_codes,
233179
('allele_1d', 'allele_list'))
234180

235181
return valid_alleles, xx_codes
182+
183+
184+
def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool):
185+
"""
186+
MAC files come in 2 different versions:
187+
188+
Martin: when they’re printed, the first is better for encoding and the
189+
second is better for decoding. The entire list was maintained both as an
190+
excel spreadsheet and also as a sybase database table. The excel was the
191+
one that was printed and distributed.
192+
193+
**==> numer.v3.txt <==**
194+
195+
Sorted by the length and the the values in the list
196+
```
197+
"LAST UPDATED: 09/30/20"
198+
CODE SUBTYPE
199+
200+
AB 01/02
201+
AC 01/03
202+
AD 01/04
203+
AE 01/05
204+
AG 01/06
205+
AH 01/07
206+
AJ 01/08
207+
```
208+
209+
**==> alpha.v3.txt <==**
210+
211+
Sorted by the code
212+
213+
```
214+
"LAST UPDATED: 10/01/20"
215+
* CODE SUBTYPE
216+
217+
AA 01/02/03/05
218+
AB 01/02
219+
AC 01/03
220+
AD 01/04
221+
AE 01/05
222+
AF 01/09
223+
AG 01/06
224+
```
225+
226+
:param db_connection: Database connection to the sqlite database
227+
:param refresh_mac: Refresh the database with newer MAC data ?
228+
:return: None
229+
"""
230+
mac_table_name = 'mac_codes'
231+
if refresh_mac or not db.table_exists(db_connection, mac_table_name):
232+
# Load the MAC file to a DataFrame
233+
mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
234+
df_mac = pd.read_csv(mac_url, sep='\t', compression='zip',
235+
skiprows=3, names=['Code', 'Alleles'])
236+
# Create a dict from code to alleles
237+
mac = df_mac.set_index("Code")["Alleles"].to_dict()
238+
# Save the mac dict to db
239+
db.save_dict(db_connection, table_name=mac_table_name,
240+
dictionary=mac, columns=('code', 'alleles'))
241+
242+
243+
def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
    """
    Build the serology to alleles mapping from the WMDA `rel_dna_ser.txt`
    file of the given IMGT version and save it to the `serology_mapping`
    table. Nothing is done when the table already exists.

    Entries that are `0` or `?` are ignored. The `PSA` and `ASA` columns may
    hold several '/' separated serologies; each of them is mapped to the
    allele of that row.

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version used to build the download URL
    :return: None
    """
    if db.table_exists(db_connection, 'serology_mapping'):
        return

    # Load WMDA serology mapping data
    rel_dna_ser_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/rel_dna_ser.txt'
    # dtype=str keeps every field as text so that the '0'/'?' comparisons
    # and the string concatenations below do not depend on type inference.
    df_sero = pd.read_csv(rel_dna_ser_url, sep=';', skiprows=6,
                          names=['Locus', 'Allele', 'USA', 'PSA', 'ASA'],
                          index_col=False, dtype=str)

    # Remove 0 and ? (masked cells become NaN and are dropped per column)
    df_sero = df_sero[(df_sero != '0') & (df_sero != '?')]
    df_sero['Allele'] = df_sero['Locus'] + df_sero['Allele']

    # USA holds a single serology per row and is used as is
    usa = df_sero[['Locus', 'Allele', 'USA']].dropna()
    usa['Sero'] = usa['Locus'] + usa['USA']
    sero_frames = [usa[['Sero', 'Allele']]]

    # PSA and ASA may hold several '/' separated serologies per row
    for column in ('PSA', 'ASA'):
        sero = df_sero[['Locus', 'Allele', column]].dropna()
        sero[column] = sero[column].str.split('/')
        sero = sero.explode(column)
        # A 0 or ? can also show up as one of the '/' separated values
        sero = sero[(sero != '0') & (sero != '?')].dropna()
        sero['Sero'] = sero['Locus'] + sero[column]
        sero_frames.append(sero[['Sero', 'Allele']])

    # An allele can be listed for the same serology in more than one
    # column; keep it only once in the stored allele list.
    sero_mapping_combined = pd.concat(sero_frames).drop_duplicates()
    sero_mapping = sero_mapping_combined.groupby('Sero')['Allele'].\
        apply(lambda alleles: '/'.join(sorted(alleles))).\
        to_dict()

    # Save the serology mapping to db
    db.save_dict(db_connection, table_name='serology_mapping',
                 dictionary=sero_mapping, columns=('serology', 'allele_list'))

pyard/db.py

Lines changed: 20 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,25 @@ def mac_code_to_alleles(connection: sqlite3.Connection, code: str) -> List[str]:
8383
return alleles
8484

8585

86+
def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[str]:
    """
    Look up Serology in the database and return corresponding list of alleles.

    :param connection: db connection of type sqlite.Connection
    :param serology: Serology
    :return: List of alleles, or None when the serology is not in the
             `serology_mapping` table
    """
    serology_query = "SELECT allele_list from serology_mapping where serology = ?"
    cursor = connection.execute(serology_query, (serology, ))
    try:
        result = cursor.fetchone()
    finally:
        # Release the cursor even when fetching the row fails
        cursor.close()
    if not result:
        # Unknown serology: callers must be ready for None
        return None
    # The allele list is stored as a single '/' separated string
    return result[0].split('/')
103+
104+
86105
def is_valid_mac_code(connection: sqlite3.Connection, code: str) -> bool:
87106
"""
88107
Check db if the MAC code exists.
@@ -196,4 +215,4 @@ def load_dict(connection: sqlite3.Connection, table_name: str, columns: Tuple[st
196215
cursor.execute(select_all_query)
197216
table_as_dict = {k: v for k, v in cursor.fetchall()}
198217
cursor.close()
199-
return table_as_dict
218+
return table_as_dict

pyard/pyard.py

Lines changed: 38 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
from typing import Iterable
2727

2828
from . import db
29-
from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes
29+
from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes, \
30+
generate_serology_mapping
3031
from .db import is_valid_mac_code, mac_code_to_alleles
3132
from .smart_sort import smart_sort_comparator
3233

@@ -63,6 +64,8 @@ def __init__(self, imgt_version: str = 'Latest',
6364
self.valid_alleles, self.xx_codes = generate_alleles_and_xx_codes(self.db_connection, imgt_version)
6465
# Load ARS mappings
6566
self.dup_g, self._G, self._lg, self._lgx = generate_ars_mapping(self.db_connection, imgt_version)
67+
# Load Serology mappings
68+
generate_serology_mapping(self.db_connection, imgt_version)
6669

6770
# Close the current read-write db connection
6871
self.db_connection.close()
@@ -169,36 +172,54 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
169172
return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]),
170173
key=functools.cmp_to_key(smart_sort_comparator)))
171174

175+
# Handle Serology
176+
if self.is_serology(glstring):
177+
if HLA_regex.search(glstring):
178+
# Remove HLA- prefix
179+
serology = glstring.split("-")[1]
180+
alleles = self._get_alleles_from_serology(serology)
181+
alleles = ['HLA-' + a for a in alleles]
182+
else:
183+
alleles = self._get_alleles_from_serology(glstring)
184+
return self.redux_gl("/".join(alleles), redux_type)
185+
172186
loc_allele = glstring.split(":")
173187
loc_name, code = loc_allele[0], loc_allele[1]
174188

175-
# handle XX codes
176-
# test that they are valid_alleles
189+
# Handle XX codes
177190
if (self.is_mac(glstring) and glstring.split(":")[1] == "XX") and loc_name in self.xx_codes:
178-
return self.redux_gl(
179-
"/".join(sorted(self.xx_codes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
191+
return self.redux_gl("/".join(self.xx_codes[loc_name]), redux_type)
180192

193+
# Handle MAC
181194
if self.is_mac(glstring) and is_valid_mac_code(self.db_connection, code):
182195
if HLA_regex.search(glstring):
183-
hla, allele_name = glstring.split("-")
196+
# Remove HLA- prefix
197+
allele_name = glstring.split("-")[1]
184198
loc_name, code = allele_name.split(":")
185199
alleles = self._get_alleles(code, loc_name)
186-
return self.redux_gl(
187-
"/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))),
188-
redux_type)
200+
alleles = ["HLA-" + a for a in alleles]
189201
else:
190202
alleles = self._get_alleles(code, loc_name)
191-
return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))),
192-
redux_type)
203+
return self.redux_gl("/".join(alleles), redux_type)
204+
193205
return self.redux(glstring, redux_type)
194206

207+
@staticmethod
def is_serology(allele: str) -> bool:
    """
    An allele is serology if the allele name after * is numeral only, no ':'

    A name without a '*' is not serology; it used to raise an IndexError.

    :param allele: The allele to test for serology
    :return: True if serology
    """
    parts = allele.split('*')
    # Without a '*' there is no allele name to inspect
    return len(parts) > 1 and parts[1].isdigit()
215+
195216
@staticmethod
196217
def is_mac(gl: str) -> bool:
197218
"""
198219
A glstring contains a MAC if there are non-digit characters
199220
right after the ':' character.
200221
:param gl: glstring to test if it has a MAC code
201-
:return: bool
222+
:return: True if MAC
202223
"""
203224
return re.search(r":\D+", gl) is not None
204225

@@ -221,6 +242,10 @@ def _get_alleles(self, code, loc_name) -> Iterable[str]:
221242
return filter(self._is_valid_allele,
222243
[f'{loc_name}:{a}' for a in alleles])
223244

245+
def _get_alleles_from_serology(self, serology) -> Iterable[str]:
    """
    Look up the alleles mapped to a serology and keep only the valid ones.

    :param serology: Serology to look up in the database
    :return: Lazy iterable of valid alleles; empty when the serology
             has no mapping
    """
    alleles = db.serology_to_alleles(self.db_connection, serology)
    # The lookup returns None for an unknown serology and filter() would
    # raise a TypeError on it, so fall back to an empty list.
    return filter(self._is_valid_allele, alleles or [])
248+
224249
def isvalid(self, allele: str) -> bool:
225250
"""
226251
Determines validity of an allele
@@ -230,7 +255,7 @@ def isvalid(self, allele: str) -> bool:
230255
:return: allele or empty
231256
:rtype: bool
232257
"""
233-
if not self.is_mac(allele):
258+
if not self.is_mac(allele) and not self.is_serology(allele):
234259
# Alleles ending with P or G are valid_alleles
235260
if allele.endswith(('P', 'G')):
236261
# remove the last character

tests/features/serology.feature

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
Feature: Serology
2+
3+
py-ard is able to map serology to the corresponding alleles and reduce to the desired
4+
level.
5+
6+
Scenario Outline:
7+
8+
Given the serology typing is <Serology>
9+
When reducing on the <Level> level (ambiguous)
10+
Then the reduced allele is found to be <Redux Allele>
11+
12+
13+
Examples: Valid A serology typings
14+
| Serology | Level | Redux Allele |
15+
| A*10 | G | A*26:01:01G/A*26:10/A*26:15/A*26:92/A*66:01:01G/A*66:03:01G |
16+
| A*10 | lg | A*26:01g/A*26:10g/A*26:15g/A*26:92g/A*66:01g/A*66:03g |
17+
| A*10 | lgx | A*26:01/A*26:10/A*26:15/A*26:92/A*66:01/A*66:03 |
18+
19+
Examples: With HLA- prefix
20+
| Serology | Level | Redux Allele |
21+
| HLA-A*10 | G | HLA-A*26:01:01G/HLA-A*26:10/HLA-A*26:15/HLA-A*26:92/HLA-A*66:01:01G/HLA-A*66:03:01G |
22+
| HLA-B*15:03 | G | HLA-B*15:03:01G |
23+
| HLA-DQB1*1 | G | HLA-DQB1*06:11:01/HLA-DQB1*06:11:02/HLA-DQB1*06:11:03/HLA-DQB1*06:12 |
24+
| HLA-DQB1*1 | lg | HLA-DQB1*06:11g/HLA-DQB1*06:12g |

tests/steps/redux_allele.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -22,3 +22,8 @@ def step_impl(context, level):
2222
@then('the reduced allele is found to be {redux_allele}')
2323
def step_impl(context, redux_allele):
2424
assert_that(context.redux_allele, is_(redux_allele))
25+
26+
27+
@given("the serology typing is {serology}")
28+
def step_impl(context, serology):
29+
context.allele = serology

0 commit comments

Comments
 (0)