Skip to content

Commit 9017299

Browse files
authored
Merge pull request #67 from pbashyal-nmdp/setup_scripts
Version 2 Nomenclature and pyard tools
2 parents 5d7d42b + 2cc7f25 commit 9017299

File tree

13 files changed

+364
-20
lines changed

13 files changed

+364
-20
lines changed

README.rst

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,3 +88,31 @@ Example
8888
# 'HLA-A*24:19g/HLA-A*24:22g^HLA-A*26:01g/HLA-A*26:10g/HLA-A*26:15g/HLA-A*26:92g/HLA-A*66:01g/HLA-A*66:03g'
8989
9090
91+
Command Line Tools
92+
------------------
93+
94+
.. code-block:: bash
95+
96+
# Import the latest IMGT database
97+
$ pyard-import
98+
Created Latest py-ard database
99+
100+
# Import a particular version of the IMGT database
101+
$ pyard-import --import-db-version 3.29.0
102+
Created py-ard version 3290 database
103+
104+
# Import a particular version of the IMGT database and
105+
# replace the v2 to v3 mapping table
106+
$ pyard-import --import-db-version 3.29.0 --v2-to-v3-mapping map2to3.csv
107+
Created py-ard version 3290 database
108+
Updated v2_mapping table with 'map2to3.csv' mapping file.
109+
110+
# Reduce a GL string from the command line
111+
$ pyard --gl 'A*01:AB' -r lgx
112+
A*01:01/A*01:02
113+
114+
$ pyard --gl 'DRB1*08:XX' -r G
115+
DRB1*08:01:01G/DRB1*08:02:01G/DRB1*08:03:02G/DRB1*08:04:01G/DRB1*08:05/ ...
116+
117+
$ pyard -v 3290 --gl 'A1' -r lgx
118+
A*01:01/A*01:02/A*01:03/A*01:06/A*01:07/A*01:08/A*01:09/A*01:10/A*01:12/ ...

pyard/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,4 @@
2424
from .pyard import ARD
2525

2626
__author__ = """NMDP Bioinformatics"""
27-
__version__ = '0.5.1'
27+
__version__ = '0.6.0'

pyard/data_repository.py

Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,25 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# py-ard
4+
# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
5+
#
6+
# This library is free software; you can redistribute it and/or modify it
7+
# under the terms of the GNU Lesser General Public License as published
8+
# by the Free Software Foundation; either version 3 of the License, or (at
9+
# your option) any later version.
10+
#
11+
# This library is distributed in the hope that it will be useful, but WITHOUT
12+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14+
# License for more details.
15+
#
16+
# You should have received a copy of the GNU Lesser General Public License
17+
# along with this library; if not, write to the Free Software Foundation,
18+
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
19+
#
20+
# > http://www.fsf.org/licensing/licenses/lgpl.html
21+
# > http://www.opensource.org/licenses/lgpl-license.php
22+
#
123
import functools
224
import sqlite3
325

@@ -299,3 +321,25 @@ def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
299321
# Save the serology mapping to db
300322
db.save_dict(db_connection, table_name='serology_mapping',
301323
dictionary=sero_mapping, columns=('serology', 'allele_list'))
324+
325+
326+
def generate_v2_to_v3_mapping(db_connection: sqlite3.Connection, imgt_version):
    """
    Make sure the 'v2_mapping' table is available in the database.

    When the table is already present nothing is done. Otherwise a small
    example mapping of V2 allele names to V3 allele names is saved as
    the 'v2_mapping' table with columns ('v2', 'v3').

    :param db_connection: db connection of type sqlite3.Connection
    :param imgt_version: IMGT database version (currently not used here)
    :return: None
    """
    # Nothing to do when the mapping table is already there
    if db.table_exists(db_connection, 'v2_mapping'):
        return

    # TODO: Create mapping table using both the allele list history and
    #  deleted alleles as reference.
    # Temporary Example
    example_mapping = {
        "A*0104": "A*01:04N",
        "A*0105N": "A*01:04N",
        "A*0111": "A*01:11N",
        "A*01123": "A*01:123N",
        "A*0115": "A*01:15N",
        "A*0116": "A*01:16N",
        "A*01160": "A*01:160N",
        "A*01162": "A*01:162N",
        "A*01178": "A*01:178N",
        "A*01179": "A*01:179N",
        "DRB5*02ZB": "DRB5*02:UTV",
    }
    db.save_dict(db_connection,
                 table_name='v2_mapping',
                 dictionary=example_mapping,
                 columns=('v2', 'v3'))

pyard/db.py

Lines changed: 41 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,25 @@
1+
# -*- coding: utf-8 -*-
2+
#
3+
# py-ard
4+
# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
5+
#
6+
# This library is free software; you can redistribute it and/or modify it
7+
# under the terms of the GNU Lesser General Public License as published
8+
# by the Free Software Foundation; either version 3 of the License, or (at
9+
# your option) any later version.
10+
#
11+
# This library is distributed in the hope that it will be useful, but WITHOUT
12+
# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
14+
# License for more details.
15+
#
16+
# You should have received a copy of the GNU Lesser General Public License
17+
# along with this library; if not, write to the Free Software Foundation,
18+
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
19+
#
20+
# > http://www.fsf.org/licensing/licenses/lgpl.html
21+
# > http://www.opensource.org/licenses/lgpl-license.php
22+
#
123
import pathlib
224
import sqlite3
325
from typing import Tuple, Dict, Set, List
@@ -92,7 +114,7 @@ def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[s
92114
:return: List of alleles
93115
"""
94116
serology_query = "SELECT allele_list from serology_mapping where serology = ?"
95-
cursor = connection.execute(serology_query, (serology, ))
117+
cursor = connection.execute(serology_query, (serology,))
96118
result = cursor.fetchone()
97119
cursor.close()
98120
if result:
@@ -102,6 +124,23 @@ def serology_to_alleles(connection: sqlite3.Connection, serology: str) -> List[s
102124
return alleles
103125

104126

127+
def v2_to_v3_allele(connection: sqlite3.Connection, v2_allele: str) -> str:
    """
    Find the V3 name stored for a V2 allele in the 'v2_mapping' table.

    :param connection: db connection of type sqlite3.Connection
    :param v2_allele: allele name in V2 nomenclature
    :return: the mapped V3 allele, or an empty string when the table
             has no row for the given V2 allele
    """
    cursor = connection.execute("SELECT v3 from v2_mapping where v2 = ?", (v2_allele,))
    # Only the first matching row is of interest
    row = cursor.fetchone()
    cursor.close()
    return row[0] if row else ''
142+
143+
105144
def is_valid_mac_code(connection: sqlite3.Connection, code: str) -> bool:
106145
"""
107146
Check db if the MAC code exists.
@@ -215,4 +254,4 @@ def load_dict(connection: sqlite3.Connection, table_name: str, columns: Tuple[st
215254
cursor.execute(select_all_query)
216255
table_as_dict = {k: v for k, v in cursor.fetchall()}
217256
cursor.close()
218-
return table_as_dict
257+
return table_as_dict

pyard/pyard.py

Lines changed: 34 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,8 +27,8 @@
2727

2828
from . import db
2929
from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes, \
30-
generate_serology_mapping
31-
from .db import is_valid_mac_code, mac_code_to_alleles
30+
generate_serology_mapping, generate_v2_to_v3_mapping
31+
from .db import is_valid_mac_code, mac_code_to_alleles, v2_to_v3_allele
3232
from .smart_sort import smart_sort_comparator
3333

3434
HLA_regex = re.compile("^HLA-")
@@ -66,6 +66,8 @@ def __init__(self, imgt_version: str = 'Latest',
6666
self.dup_g, self._G, self._lg, self._lgx = generate_ars_mapping(self.db_connection, imgt_version)
6767
# Load Serology mappings
6868
generate_serology_mapping(self.db_connection, imgt_version)
69+
# Load V2 to V3 mappings
70+
generate_v2_to_v3_mapping(self.db_connection, imgt_version)
6971

7072
# Close the current read-write db connection
7173
self.db_connection.close()
@@ -172,6 +174,11 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
172174
return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]),
173175
key=functools.cmp_to_key(smart_sort_comparator)))
174176

177+
# Handle V2 to V3 mapping
178+
if self.is_v2(glstring):
179+
glstring = self._map_v2_to_v3(glstring)
180+
return self.redux_gl(glstring, redux_type)
181+
175182
# Handle Serology
176183
if self.is_serology(glstring):
177184
alleles = self._get_alleles_from_serology(glstring)
@@ -232,6 +239,17 @@ def is_mac(gl: str) -> bool:
232239
"""
233240
return re.search(r":\D+", gl) is not None
234241

242+
@staticmethod
243+
def is_v2(allele: str) -> bool:
244+
"""
245+
Version 2 of the nomenclature is a single field.
246+
It does not have any ':' field separator.
247+
Eg: A*0104
248+
:param allele: Possible allele
249+
:return: Is the allele in V2 nomenclature
250+
"""
251+
return '*' in allele and not ':' in allele
252+
235253
def _is_valid_allele(self, allele):
236254
"""
237255
Test if allele is valid in the current imgt database
@@ -255,7 +273,7 @@ def _get_alleles(self, code, locus_antigen) -> Iterable[str]:
255273
# else it's a group expansion
256274
is_allelic_expansion = any([':' in allele for allele in alleles])
257275
if is_allelic_expansion:
258-
locus = locus_antigen.split('*')[0] # Just keep the locus name
276+
locus = locus_antigen.split('*')[0] # Just keep the locus name
259277
alleles = [f'{locus}*{a}' for a in alleles]
260278
else:
261279
alleles = [f'{locus_antigen}:{a}' for a in alleles]
@@ -272,6 +290,14 @@ def _get_alleles_from_serology(self, serology) -> Iterable[str]:
272290
else:
273291
return alleles
274292

293+
def _map_v2_to_v3(self, v2_allele):
    """
    Translate a V2 versioned allele into its V3 version by looking it
    up through this instance's database connection.

    :param v2_allele: V2 versioned allele
    :return: whatever `v2_to_v3_allele` returns for the allele
    """
    v3_allele = v2_to_v3_allele(self.db_connection, v2_allele)
    return v3_allele
300+
275301
def isvalid(self, allele: str) -> bool:
276302
"""
277303
Determines validity of an allele
@@ -283,7 +309,9 @@ def isvalid(self, allele: str) -> bool:
283309
"""
284310
if allele == '':
285311
return False
286-
if not self.is_mac(allele) and not self.is_serology(allele):
312+
if not self.is_mac(allele) and \
313+
not self.is_serology(allele) and \
314+
not self.is_v2(allele):
287315
# Alleles ending with P or G are valid_alleles
288316
if allele.endswith(('P', 'G')):
289317
# remove the last character
@@ -330,7 +358,7 @@ def mac_toG(self, allele: str) -> str:
330358
"""
331359
locus_antigen, code = allele.split(":")
332360
if HLA_regex.search(allele):
333-
locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix
361+
locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix
334362
if is_valid_mac_code(self.db_connection, code):
335363
alleles = self._get_alleles(code, locus_antigen)
336364
group = [self.toG(a) for a in alleles]
@@ -370,7 +398,7 @@ def expand_mac(self, mac_code: str):
370398
locus_antigen, code = mac_code.split(":")
371399
if is_valid_mac_code(self.db_connection, code):
372400
if HLA_regex.search(mac_code):
373-
locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix
401+
locus_antigen = locus_antigen.split("-")[1] # Remove HLA- prefix
374402
return ['HLA-' + a for a in self._get_alleles(code, locus_antigen)]
375403
else:
376404
return list(self._get_alleles(code, locus_antigen))

pyard/smart_sort.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
import re
2626

2727
expr_regex = re.compile('[NQLSGg]')
28+
glstring_chars = re.compile('[/|+^~]')
2829

2930

3031
@functools.lru_cache(maxsize=1000)
@@ -43,6 +44,13 @@ def smart_sort_comparator(a1, a2):
4344
if a1 == a2:
4445
return 0
4546

47+
# GL String matches
48+
if re.search(glstring_chars, a1) or re.search(glstring_chars, a2):
49+
if a1 > a2:
50+
return 1
51+
else:
52+
return -1
53+
4654
# remove any non-numerics
4755
a1 = re.sub(expr_regex, '', a1)
4856
a2 = re.sub(expr_regex, '', a2)
@@ -92,4 +100,3 @@ def smart_sort_comparator(a1, a2):
92100

93101
# All fields are considered equal after 4th field
94102
return 0
95-

scripts/pyard

Lines changed: 69 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,69 @@
1+
#!/usr/bin/env python
2+
# -*- coding: utf-8 -*-
3+
#
4+
# py-ard
5+
# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
6+
#
7+
# This library is free software; you can redistribute it and/or modify it
8+
# under the terms of the GNU Lesser General Public License as published
9+
# by the Free Software Foundation; either version 3 of the License, or (at
10+
# your option) any later version.
11+
#
12+
# This library is distributed in the hope that it will be useful, but WITHOUT
13+
# ANY WARRANTY; with out even the implied warranty of MERCHANTABILITY or
14+
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public
15+
# License for more details.
16+
#
17+
# You should have received a copy of the GNU Lesser General Public License
18+
# along with this library; if not, write to the Free Software Foundation,
19+
# Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA.
20+
#
21+
# > http://www.fsf.org/licensing/licenses/lgpl.html
22+
# > http://www.opensource.org/licenses/lgpl-license.php
23+
#
24+
import argparse
25+
26+
import pyard
27+
28+
29+
def get_imgt_version(imgt_version):
    """
    Normalize an IMGT database version given on the command line.

    Dots are removed, so '3.29.0' becomes '3290'.

    :param imgt_version: version string, or a falsy value when none was given
    :return: the version with all '.' removed, or None when no version
             was supplied
    :raises RuntimeError: when the dot-stripped version is not all digits
    """
    # No version supplied: the caller decides what to use
    if not imgt_version:
        return None

    digits_only = imgt_version.replace('.', '')
    if not digits_only.isdigit():
        raise RuntimeError(f"{imgt_version} is not a valid IMGT database version number")
    return digits_only
36+
37+
38+
if __name__ == '__main__':
    # Command line entry point: reduce a single GL string and print the result.
    parser = argparse.ArgumentParser(
        # The usage line lists the options exactly as they are defined below:
        # an optional IMGT version plus the two required options.
        usage="""%(prog)s [-v <IMGT DB Version>] --gl <GL String> -r {G,lg,lgx}""",
        description="""py-ard tool to redux GL String"""
    )
    parser.add_argument(
        "-v",
        "--imgt-version",
        dest="imgt_version",
        help="IMGT database version, with or without dots, e.g. 3.29.0 or 3290"
    )
    parser.add_argument(
        "--gl",
        required=True,
        dest="gl_string",
        help="GL String to reduce"
    )
    parser.add_argument(
        "-r",
        choices=['G', 'lg', 'lgx'],
        required=True,
        dest="redux_type",
        help="reduction type"
    )

    args = parser.parse_args()

    # Only pass a version to ARD when one was given, so that ARD's own
    # default is used otherwise.
    imgt_version = get_imgt_version(args.imgt_version)
    if imgt_version:
        ard = pyard.ARD(imgt_version)
    else:
        ard = pyard.ARD()

    print(ard.redux_gl(args.gl_string, args.redux_type))
    # Drop the reference explicitly once the result has been printed
    del ard

0 commit comments

Comments
 (0)