Skip to content

Commit 724ba18

Browse files
committed
Merge branch 'mmaiers-nmdp-littlep'
2 parents b7fa29c + 7fb7f8e commit 724ba18

File tree

5 files changed

+79
-39
lines changed

5 files changed

+79
-39
lines changed

pyard/data_repository.py

Lines changed: 38 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# > http://www.fsf.org/licensing/licenses/lgpl.html
2121
# > http://www.opensource.org/licenses/lgpl-license.php
2222
#
23+
from collections import namedtuple
2324
import functools
2425
import sqlite3
2526

@@ -37,6 +38,9 @@
3738
# List of expression characters
3839
expression_chars = ['N', 'Q', 'L', 'S']
3940

41+
ars_mapping_tables = ['dup_g', 'dup_lg', 'dup_lgx', 'g_group', 'lg_group', 'lgx_group']
42+
ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)
43+
4044

4145
def get_n_field_allele(allele: str, n: int) -> str:
4246
"""
@@ -64,12 +68,15 @@ def get_2field_allele(a: str) -> str:
6468

6569

6670
def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
67-
if db.tables_exists(db_connection, ['dup_g', 'g_group', 'lg_group', 'lgx_group']):
71+
if db.tables_exists(db_connection, ars_mapping_tables):
6872
dup_g = db.load_dict(db_connection, table_name='dup_g', columns=('allele', 'g_group'))
73+
dup_lg = db.load_dict(db_connection, table_name='dup_lg', columns=('allele', 'lg_group'))
74+
dup_lgx = db.load_dict(db_connection, table_name='dup_lgx', columns=('allele', 'lgx_group'))
6975
g_group = db.load_dict(db_connection, table_name='g_group', columns=('allele', 'g'))
7076
lg_group = db.load_dict(db_connection, table_name='lg_group', columns=('allele', 'lg'))
7177
lgx_group = db.load_dict(db_connection, table_name='lgx_group', columns=('allele', 'lgx'))
72-
return dup_g, g_group, lg_group, lgx_group
78+
return ARSMapping(dup_g=dup_g, dup_lg=dup_lg, dup_lgx=dup_lgx,
79+
g_group=g_group, lg_group=lg_group, lgx_group=lgx_group)
7380

7481
ars_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt'
7582
df = pd.read_csv(ars_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()
@@ -81,17 +88,38 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
8188

8289
df['2d'] = df['A'].apply(get_2field_allele)
8390
df['3d'] = df['A'].apply(get_3field_allele)
91+
df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
92+
df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))
8493

94+
# multiple Gs
8595
mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts()
8696
multiple_g_list = mg[mg > 1].reset_index()['index'].to_list()
8797

98+
# Keep only the alleles that have more than 1 mapping
8899
dup_g = df[df['2d'].isin(multiple_g_list)][['G', '2d']] \
89100
.drop_duplicates() \
90101
.groupby('2d', as_index=True).agg("/".join) \
91102
.to_dict()['G']
92103

93-
df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
94-
df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))
104+
# multiple lg
105+
mlg = df.drop_duplicates(['2d', 'lg'])['2d'].value_counts()
106+
multiple_lg_list = mlg[mlg > 1].reset_index()['index'].to_list()
107+
108+
# Keep only the alleles that have more than 1 mapping
109+
dup_lg = df[df['2d'].isin(multiple_lg_list)][['lg', '2d']] \
110+
.drop_duplicates() \
111+
.groupby('2d', as_index=True).agg("/".join) \
112+
.to_dict()['lg']
113+
114+
# multiple lgx
115+
mlgx = df.drop_duplicates(['2d', 'lgx'])['2d'].value_counts()
116+
multiple_lgx_list = mlgx[mlgx > 1].reset_index()['index'].to_list()
117+
118+
# Keep only the alleles that have more than 1 mapping
119+
dup_lgx = df[df['2d'].isin(multiple_lgx_list)][['lgx', '2d']] \
120+
.drop_duplicates() \
121+
.groupby('2d', as_index=True).agg("/".join) \
122+
.to_dict()['lgx']
95123

96124
# Creating dictionaries with mac_code->ARS group mapping
97125
df_g = pd.concat([
@@ -116,11 +144,14 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
116144
lgx_group = df_lgx.set_index('A')['lgx'].to_dict()
117145

118146
db.save_dict(db_connection, table_name='dup_g', dictionary=dup_g, columns=('allele', 'g_group'))
147+
db.save_dict(db_connection, table_name='dup_lg', dictionary=dup_lg, columns=('allele', 'lg_group'))
148+
db.save_dict(db_connection, table_name='dup_lgx', dictionary=dup_lgx, columns=('allele', 'lgx_group'))
119149
db.save_dict(db_connection, table_name='g_group', dictionary=g_group, columns=('allele', 'g'))
120150
db.save_dict(db_connection, table_name='lg_group', dictionary=lg_group, columns=('allele', 'lg'))
121151
db.save_dict(db_connection, table_name='lgx_group', dictionary=lgx_group, columns=('allele', 'lgx'))
122152

123-
return dup_g, g_group, lg_group, lgx_group
153+
return ARSMapping(dup_g=dup_g, dup_lg=dup_lg, dup_lgx=dup_lgx,
154+
g_group=g_group, lg_group=lg_group, lgx_group=lgx_group)
124155

125156

126157
def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
@@ -331,7 +362,8 @@ def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
331362

332363
# re-sort allele lists into smartsort order
333364
for sero in sero_mapping.keys():
334-
sero_mapping[sero] = '/'.join(sorted(sero_mapping[sero].split('/'), key=functools.cmp_to_key(smart_sort_comparator)))
365+
sero_mapping[sero] = '/'.join(
366+
sorted(sero_mapping[sero].split('/'), key=functools.cmp_to_key(smart_sort_comparator)))
335367

336368
# Save the serology mapping to db
337369
db.save_dict(db_connection, table_name='serology_mapping',

pyard/pyard.py

Lines changed: 21 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,8 @@
2626
from typing import Iterable
2727

2828
from . import db
29-
from .data_repository import generate_ars_mapping, generate_mac_codes, generate_alleles_and_xx_codes, \
29+
from .data_repository import generate_ars_mapping, \
30+
generate_mac_codes, generate_alleles_and_xx_codes, \
3031
generate_serology_mapping, generate_v2_to_v3_mapping
3132
from .db import is_valid_mac_code, mac_code_to_alleles, v2_to_v3_allele
3233
from .smart_sort import smart_sort_comparator
@@ -63,7 +64,7 @@ def __init__(self, imgt_version: str = 'Latest',
6364
# Load Alleles and XX Codes
6465
self.valid_alleles, self.xx_codes = generate_alleles_and_xx_codes(self.db_connection, imgt_version)
6566
# Load ARS mappings
66-
self.dup_g, self._G, self._lg, self._lgx = generate_ars_mapping(self.db_connection, imgt_version)
67+
self.ars_mappings = generate_ars_mapping(self.db_connection, imgt_version)
6768
# Load Serology mappings
6869
generate_serology_mapping(self.db_connection, imgt_version)
6970
# Load V2 to V3 mappings
@@ -111,21 +112,25 @@ def redux(self, allele: str, ars_type: str) -> str:
111112
if allele.endswith(('P', 'G')):
112113
allele = allele[:-1]
113114

114-
if ars_type == "G" and allele in self._G:
115-
if allele in self.dup_g:
116-
return self.dup_g[allele]
115+
if ars_type == "G" and allele in self.ars_mappings.g_group:
116+
if allele in self.ars_mappings.dup_g:
117+
return self.ars_mappings.dup_g[allele]
117118
else:
118-
return self._G[allele]
119+
return self.ars_mappings.g_group[allele]
119120
elif ars_type == "lg":
120-
if allele in self._lg:
121-
return self._lg[allele]
121+
if allele in self.ars_mappings.dup_lg:
122+
return self.ars_mappings.dup_lg[allele]
123+
elif allele in self.ars_mappings.lg_group:
124+
return self.ars_mappings.lg_group[allele]
122125
else:
123126
# for 'lg' when allele is not in G group,
124127
# return allele with only the first 2 fields
125128
return ':'.join(allele.split(':')[0:2]) + 'g'
126129
elif ars_type == "lgx":
127-
if allele in self._lgx:
128-
return self._lgx[allele]
130+
if allele in self.ars_mappings.dup_lgx:
131+
return self.ars_mappings.dup_lgx[allele]
132+
elif allele in self.ars_mappings.lgx_group:
133+
return self.ars_mappings.lgx_group[allele]
129134
else:
130135
# for 'lgx' when allele is not in G group,
131136
# return allele with only the first 2 fields
@@ -296,7 +301,8 @@ def _get_alleles_from_serology(self, serology) -> Iterable[str]:
296301
else:
297302
return alleles
298303

299-
def _combine_with_colon(self, digits_field):
304+
@staticmethod
305+
def _combine_with_colon(digits_field):
300306
num_of_digits = len(digits_field)
301307
return ':'.join(digits_field[i:i + 2] for i in range(0, num_of_digits, 2))
302308

@@ -424,11 +430,11 @@ def toG(self, allele: str) -> str:
424430
:return: ARS G reduced allele
425431
:rtype: str
426432
"""
427-
if allele in self._G:
428-
if allele in self.dup_g:
429-
return self.dup_g[allele]
433+
if allele in self.ars_mappings.g_group:
434+
if allele in self.ars_mappings.dup_g:
435+
return self.ars_mappings.dup_g[allele]
430436
else:
431-
return self._G[allele]
437+
return self.ars_mappings.g_group[allele]
432438
else:
433439
return "X"
434440

tests/environment.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,4 +2,4 @@
22

33

44
def before_all(context):
5-
context.ard = ARD('3290', data_dir='/tmp/py-ard')
5+
context.ard = ARD('3440', data_dir='/tmp/py-ard')

tests/features/allele.feature

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -7,17 +7,19 @@ Feature: Alleles
77
Then the reduced allele is found to be <Redux Allele>
88

99
Examples:
10-
| Allele | Level | Redux Allele |
11-
| A*01:01:01 | G | A*01:01:01G |
12-
| A*01:01:01 | lg | A*01:01g |
13-
| A*01:01:01 | lgx | A*01:01 |
10+
| Allele | Level | Redux Allele |
11+
| A*01:01:01 | G | A*01:01:01G |
12+
| A*01:01:01 | lg | A*01:01g |
13+
| A*01:01:01 | lgx | A*01:01 |
1414

15-
| HLA-A*01:01:01 | G | HLA-A*01:01:01G |
16-
| HLA-A*01:01:01 | lg | HLA-A*01:01g |
17-
| HLA-A*01:01:01 | lgx | HLA-A*01:01 |
15+
| HLA-A*01:01:01 | G | HLA-A*01:01:01G |
16+
| HLA-A*01:01:01 | lg | HLA-A*01:01g |
17+
| HLA-A*01:01:01 | lgx | HLA-A*01:01 |
1818

19-
| DRB1*14:05:01 | lgx | DRB1*14:05 |
20-
| DRB1*14:05:01 | lg | DRB1*14:05g |
19+
| DRB1*14:05:01 | lgx | DRB1*14:05 |
20+
| DRB1*14:05:01 | lg | DRB1*14:05g |
2121

22-
| DRB1*14:06:01 | lgx | DRB1*14:06 |
23-
| DRB1*14:06:01 | lg | DRB1*14:06g |
22+
| DRB1*14:06:01 | lgx | DRB1*14:06 |
23+
| DRB1*14:06:01 | lg | DRB1*14:06g |
24+
| C*02:02 | lg | C*02:02g/C*02:10g |
25+
| C*02:02 | lgx | C*02:02/C*02:10 |

0 commit comments

Comments
 (0)