2020# > http://www.fsf.org/licensing/licenses/lgpl.html
2121# > http://www.opensource.org/licenses/lgpl-license.php
2222#
23+ from collections import namedtuple
2324import functools
2425import sqlite3
2526
3738# List of expression characters
3839expression_chars = ['N' , 'Q' , 'L' , 'S' ]
3940
41+ ars_mapping_tables = ['dup_g' , 'dup_lg' , 'dup_lgx' , 'g_group' , 'lg_group' , 'lgx_group' ]
42+ ARSMapping = namedtuple ("ARSMapping" , ars_mapping_tables )
43+
4044
4145def get_n_field_allele (allele : str , n : int ) -> str :
4246 """
@@ -64,12 +68,15 @@ def get_2field_allele(a: str) -> str:
6468
6569
6670def generate_ars_mapping (db_connection : sqlite3 .Connection , imgt_version ):
67- if db .tables_exists (db_connection , [ 'dup_g' , 'g_group' , 'lg_group' , 'lgx_group' ] ):
71+ if db .tables_exists (db_connection , ars_mapping_tables ):
6872 dup_g = db .load_dict (db_connection , table_name = 'dup_g' , columns = ('allele' , 'g_group' ))
73+ dup_lg = db .load_dict (db_connection , table_name = 'dup_lg' , columns = ('allele' , 'lg_group' ))
74+ dup_lgx = db .load_dict (db_connection , table_name = 'dup_lgx' , columns = ('allele' , 'lgx_group' ))
6975 g_group = db .load_dict (db_connection , table_name = 'g_group' , columns = ('allele' , 'g' ))
7076 lg_group = db .load_dict (db_connection , table_name = 'lg_group' , columns = ('allele' , 'lg' ))
7177 lgx_group = db .load_dict (db_connection , table_name = 'lgx_group' , columns = ('allele' , 'lgx' ))
72- return dup_g , g_group , lg_group , lgx_group
78+ return ARSMapping (dup_g = dup_g , dup_lg = dup_lg , dup_lgx = dup_lgx ,
79+ g_group = g_group , lg_group = lg_group , lgx_group = lgx_group )
7380
7481 ars_url = f'{ IMGT_HLA_URL } { imgt_version } /wmda/hla_nom_g.txt'
7582 df = pd .read_csv (ars_url , skiprows = 6 , names = ["Locus" , "A" , "G" ], sep = ";" ).dropna ()
@@ -81,17 +88,38 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
8188
8289 df ['2d' ] = df ['A' ].apply (get_2field_allele )
8390 df ['3d' ] = df ['A' ].apply (get_3field_allele )
91+ df ['lg' ] = df ['G' ].apply (lambda a : ":" .join (a .split (":" )[0 :2 ]) + "g" )
92+ df ['lgx' ] = df ['G' ].apply (lambda a : ":" .join (a .split (":" )[0 :2 ]))
8493
94+ # 2-field alleles that map to more than one distinct G group
8595 mg = df .drop_duplicates (['2d' , 'G' ])['2d' ].value_counts ()
8696 multiple_g_list = mg [mg > 1 ].reset_index ()['index' ].to_list ()
8797
98+ # Keep only the alleles that have more than 1 mapping
8899 dup_g = df [df ['2d' ].isin (multiple_g_list )][['G' , '2d' ]] \
89100 .drop_duplicates () \
90101 .groupby ('2d' , as_index = True ).agg ("/" .join ) \
91102 .to_dict ()['G' ]
92103
93- df ['lg' ] = df ['G' ].apply (lambda a : ":" .join (a .split (":" )[0 :2 ]) + "g" )
94- df ['lgx' ] = df ['G' ].apply (lambda a : ":" .join (a .split (":" )[0 :2 ]))
104+ # 2-field alleles that map to more than one distinct lg group
105+ mlg = df .drop_duplicates (['2d' , 'lg' ])['2d' ].value_counts ()
106+ multiple_lg_list = mlg [mlg > 1 ].reset_index ()['index' ].to_list ()
107+
108+ # Keep only the alleles that have more than 1 mapping
109+ dup_lg = df [df ['2d' ].isin (multiple_lg_list )][['lg' , '2d' ]] \
110+ .drop_duplicates () \
111+ .groupby ('2d' , as_index = True ).agg ("/" .join ) \
112+ .to_dict ()['lg' ]
113+
114+ # 2-field alleles that map to more than one distinct lgx group
115+ mlgx = df .drop_duplicates (['2d' , 'lgx' ])['2d' ].value_counts ()
116+ multiple_lgx_list = mlgx [mlgx > 1 ].reset_index ()['index' ].to_list ()
117+
118+ # Keep only the alleles that have more than 1 mapping
119+ dup_lgx = df [df ['2d' ].isin (multiple_lgx_list )][['lgx' , '2d' ]] \
120+ .drop_duplicates () \
121+ .groupby ('2d' , as_index = True ).agg ("/" .join ) \
122+ .to_dict ()['lgx' ]
95123
96124 # Creating dictionaries with mac_code->ARS group mapping
97125 df_g = pd .concat ([
@@ -116,11 +144,14 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
116144 lgx_group = df_lgx .set_index ('A' )['lgx' ].to_dict ()
117145
118146 db .save_dict (db_connection , table_name = 'dup_g' , dictionary = dup_g , columns = ('allele' , 'g_group' ))
147+ db .save_dict (db_connection , table_name = 'dup_lg' , dictionary = dup_lg , columns = ('allele' , 'lg_group' ))
148+ db .save_dict (db_connection , table_name = 'dup_lgx' , dictionary = dup_lgx , columns = ('allele' , 'lgx_group' ))
119149 db .save_dict (db_connection , table_name = 'g_group' , dictionary = g_group , columns = ('allele' , 'g' ))
120150 db .save_dict (db_connection , table_name = 'lg_group' , dictionary = lg_group , columns = ('allele' , 'lg' ))
121151 db .save_dict (db_connection , table_name = 'lgx_group' , dictionary = lgx_group , columns = ('allele' , 'lgx' ))
122152
123- return dup_g , g_group , lg_group , lgx_group
153+ return ARSMapping (dup_g = dup_g , dup_lg = dup_lg , dup_lgx = dup_lgx ,
154+ g_group = g_group , lg_group = lg_group , lgx_group = lgx_group )
124155
125156
126157def generate_alleles_and_xx_codes (db_connection : sqlite3 .Connection , imgt_version ):
@@ -331,7 +362,8 @@ def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
331362
332363 # re-sort allele lists into smartsort order
333364 for sero in sero_mapping .keys ():
334- sero_mapping [sero ] = '/' .join (sorted (sero_mapping [sero ].split ('/' ), key = functools .cmp_to_key (smart_sort_comparator )))
365+ sero_mapping [sero ] = '/' .join (
366+ sorted (sero_mapping [sero ].split ('/' ), key = functools .cmp_to_key (smart_sort_comparator )))
335367
336368 # Save the serology mapping to db
337369 db .save_dict (db_connection , table_name = 'serology_mapping' ,
0 commit comments