Skip to content

Commit 8985ae3

Browse files
committed
Read the dup_* tables when the db already exists.
Collapse all ARS mappings into a single ARSMapping namedtuple.
1 parent f392a0d commit 8985ae3

File tree

1 file changed

+19
-6
lines changed

1 file changed

+19
-6
lines changed

pyard/data_repository.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
# > http://www.fsf.org/licensing/licenses/lgpl.html
2121
# > http://www.opensource.org/licenses/lgpl-license.php
2222
#
23+
from collections import namedtuple
2324
import functools
2425
import sqlite3
2526

@@ -37,6 +38,9 @@
3738
# List of expression characters
3839
expression_chars = ['N', 'Q', 'L', 'S']
3940

41+
ars_mapping_tables = ['dup_g', 'dup_lg', 'dup_lgx', 'g_group', 'lg_group', 'lgx_group']
42+
ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)
43+
4044

4145
def get_n_field_allele(allele: str, n: int) -> str:
4246
"""
@@ -64,12 +68,15 @@ def get_2field_allele(a: str) -> str:
6468

6569

6670
def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
67-
if db.tables_exists(db_connection, ['dup_g', 'g_group', 'lg_group', 'lgx_group']):
71+
if db.tables_exists(db_connection, ars_mapping_tables):
6872
dup_g = db.load_dict(db_connection, table_name='dup_g', columns=('allele', 'g_group'))
73+
dup_lg = db.load_dict(db_connection, table_name='dup_lg', columns=('allele', 'lg_group'))
74+
dup_lgx = db.load_dict(db_connection, table_name='dup_lgx', columns=('allele', 'lgx_group'))
6975
g_group = db.load_dict(db_connection, table_name='g_group', columns=('allele', 'g'))
7076
lg_group = db.load_dict(db_connection, table_name='lg_group', columns=('allele', 'lg'))
7177
lgx_group = db.load_dict(db_connection, table_name='lgx_group', columns=('allele', 'lgx'))
72-
return dup_g, g_group, lg_group, lgx_group
78+
return ARSMapping(dup_g=dup_g, dup_lg=dup_lg, dup_lgx=dup_lgx,
79+
g_group=g_group, lg_group=lg_group, lgx_group=lgx_group)
7380

7481
ars_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt'
7582
df = pd.read_csv(ars_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()
@@ -84,32 +91,36 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
8491
df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
8592
df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))
8693

94+
# multiple Gs
8795
mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts()
8896
multiple_g_list = mg[mg > 1].reset_index()['index'].to_list()
8997

98+
# Keep only the alleles that have more than 1 mapping
9099
dup_g = df[df['2d'].isin(multiple_g_list)][['G', '2d']] \
91100
.drop_duplicates() \
92101
.groupby('2d', as_index=True).agg("/".join) \
93102
.to_dict()['G']
94103

104+
# multiple lg
95105
mlg = df.drop_duplicates(['2d', 'lg'])['2d'].value_counts()
96106
multiple_lg_list = mlg[mlg > 1].reset_index()['index'].to_list()
97107

108+
# Keep only the alleles that have more than 1 mapping
98109
dup_lg = df[df['2d'].isin(multiple_lg_list)][['lg', '2d']] \
99110
.drop_duplicates() \
100111
.groupby('2d', as_index=True).agg("/".join) \
101112
.to_dict()['lg']
102113

114+
# multiple lgx
103115
mlgx = df.drop_duplicates(['2d', 'lgx'])['2d'].value_counts()
104116
multiple_lgx_list = mlgx[mlgx > 1].reset_index()['index'].to_list()
105117

118+
# Keep only the alleles that have more than 1 mapping
106119
dup_lgx = df[df['2d'].isin(multiple_lgx_list)][['lgx', '2d']] \
107120
.drop_duplicates() \
108121
.groupby('2d', as_index=True).agg("/".join) \
109122
.to_dict()['lgx']
110123

111-
112-
113124
# Creating dictionaries with mac_code->ARS group mapping
114125
df_g = pd.concat([
115126
df[['2d', 'G']].rename(columns={'2d': 'A'}),
@@ -139,7 +150,8 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
139150
db.save_dict(db_connection, table_name='lg_group', dictionary=lg_group, columns=('allele', 'lg'))
140151
db.save_dict(db_connection, table_name='lgx_group', dictionary=lgx_group, columns=('allele', 'lgx'))
141152

142-
return dup_g, dup_lg, dup_lgx, g_group, lg_group, lgx_group
153+
return ARSMapping(dup_g=dup_g, dup_lg=dup_lg, dup_lgx=dup_lgx,
154+
g_group=g_group, lg_group=lg_group, lgx_group=lgx_group)
143155

144156

145157
def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
@@ -350,7 +362,8 @@ def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
350362

351363
# re-sort allele lists into smartsort order
352364
for sero in sero_mapping.keys():
353-
sero_mapping[sero] = '/'.join(sorted(sero_mapping[sero].split('/'), key=functools.cmp_to_key(smart_sort_comparator)))
365+
sero_mapping[sero] = '/'.join(
366+
sorted(sero_mapping[sero].split('/'), key=functools.cmp_to_key(smart_sort_comparator)))
354367

355368
# Save the serology mapping to db
356369
db.save_dict(db_connection, table_name='serology_mapping',

0 commit comments

Comments
 (0)