2626import pathlib
2727import pickle
2828import re
29- import urllib .request
3029from functools import partial
3130from operator import is_not
3231from typing import Dict
3534
3635from .broad_splits import broad_splits_mapping
3736from .smart_sort import smart_sort_comparator
38- from .util import pandas_explode
3937
4038# The GitHub URL where IMGT HLA files are downloaded.
4139IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'
@@ -78,6 +76,7 @@ def get_2field_allele(a: str) -> str:
7876
7977class ARD (object ):
8078 """ ARD reduction for HLA """
79+
8180 def __init__ (self , dbversion : str = 'Latest' ,
8281 load_mac_file : bool = True ,
8382 verbose : bool = False ,
@@ -90,92 +89,77 @@ def __init__(self, dbversion: str = 'Latest',
9089 self ._load_mac_file = load_mac_file
9190 self ._remove_invalid = remove_invalid
9291
93-
94- # TODO: add check for valid_alleles ARD type
95- # TODO: add check for valid_alleles db version
96-
9792 # Set data directory where all the downloaded files will go
9893 if data_dir is None :
99- data_dir = os .path .dirname (__file__ )
100- else :
101- pathlib .Path (data_dir ).mkdir (exist_ok = True )
94+ data_dir = pathlib .Path .home () / ".pyard"
10295
103- ars_url = IMGT_HLA_URL + dbversion + '/wmda/hla_nom_g.txt'
104- ars_file = data_dir + '/hla_nom_g.' + str (dbversion ) + ".txt"
105- # Downloading ARS file
106- if not os .path .isfile (ars_file ):
107- if verbose :
108- logging .info ("Downloading " + str (dbversion ) + " ARD file" )
109- urllib .request .urlretrieve (ars_url , ars_file )
96+ data_dir = f'{ data_dir } /{ dbversion } '
97+ pathlib .Path (data_dir ).mkdir (parents = True , exist_ok = True )
11098
11199 # Load MAC codes
112100 if load_mac_file :
113101 self .generate_mac_codes (data_dir )
114102 # Load Alleles and XX Codes
115103 self .generate_alleles_and_xxcodes (dbversion , data_dir )
104+ # Load ARS mappings
105+ self .generate_ars_mapping (data_dir )
116106
117- # Loading ARS file into pandas
118- # TODO: Make skip dynamic in case the files are not consistent
119- df = pd .read_csv (ars_file , skiprows = 6 ,
120- names = ["Locus" , "A" , "G" ], sep = ";" ).dropna ()
107+ def generate_ars_mapping (self , data_dir ):
121108
122- df ['Locus' ] = df ['Locus' ].apply (lambda l : l .split ("*" )[0 ])
123- df ['A' ] = df [['Locus' , 'A' ]].apply (lambda row : [row ['Locus' ] + "*" + a
124- for a in
125- row ['A' ].split ("/" )
126- ],
127- axis = 1 )
128- df ['G' ] = df [['Locus' , 'G' ]].apply (lambda row : "*" .join ([row ['Locus' ],
129- row ['G' ]]),
130- axis = 1 )
109+ mapping_file = f'{ data_dir } /ars_mapping.pickle'
110+ if os .path .isfile (mapping_file ):
111+ with open (mapping_file , 'rb' ) as load_file :
112+ ars_mapping = pickle .load (load_file )
113+ self ._G , self ._lg , self ._lgx , self .dup_g = ars_mapping
114+ return
131115
132- df = pandas_explode (df , 'A' )
116+ ars_url = f'{ IMGT_HLA_URL } { self ._dbversion } /wmda/hla_nom_g.txt'
117+ df = pd .read_csv (ars_url , skiprows = 6 , names = ["Locus" , "A" , "G" ], sep = ";" ).dropna ()
118+
119+ df ['A' ] = df ['A' ].apply (lambda a : a .split ('/' ))
120+ df = df .explode ('A' )
121+ df ['A' ] = df ['Locus' ] + df ['A' ]
122+ df ['G' ] = df ['Locus' ] + df ['G' ]
133123
134124 df ['2d' ] = df ['A' ].apply (get_2field_allele )
135125 df ['3d' ] = df ['A' ].apply (get_3field_allele )
136126
137- df_values = df .drop_duplicates (['2d' , 'G' ])['2d' ] \
138- .value_counts ().reset_index () \
139- .sort_values (by = '2d' , ascending = False )
140- multiple_Glist = df_values [df_values ['2d' ] > 1 ]['index' ].tolist ()
141- self .dup_g = df [df ['2d' ].isin (multiple_Glist )][['G' , '2d' ]] \
127+ mg = df .drop_duplicates (['2d' , 'G' ])['2d' ].value_counts ()
128+ multiple_g_list = mg [mg > 1 ].reset_index ()['index' ].to_list ()
129+
130+ self .dup_g = df [df ['2d' ].isin (multiple_g_list )][['G' , '2d' ]] \
142131 .drop_duplicates () \
143132 .groupby ('2d' , as_index = True ).agg ("/" .join ) \
144133 .to_dict ()['G' ]
145134
146- df ['lg' ] = df ['G' ].apply (lambda a :
147- ":" .join (a .split (":" )[0 :2 ]) + "g" )
148-
149- df ['lgx' ] = df ['G' ].apply (lambda a :
150- ":" .join (a .split (":" )[0 :2 ]))
135+ df ['lg' ] = df ['G' ].apply (lambda a : ":" .join (a .split (":" )[0 :2 ]) + "g" )
136+ df ['lgx' ] = df ['G' ].apply (lambda a : ":" .join (a .split (":" )[0 :2 ]))
151137
152138 # Creating dictionaries with allele->ARS group mapping
153- self ._G = pd .concat ([df .drop (['A' , 'lg' , 'lgx' , '3d' ], axis = 1 )
154- .rename (index = str ,
155- columns = {"2d" : "A" })[['A' , 'G' ]],
156- df .drop (['A' , 'lg' , 'lgx' , '2d' ], axis = 1 )
157- .rename (index = str ,
158- columns = {"3d" : "A" })[['A' , 'G' ]],
159- df [['A' , 'G' ]]],
160- ignore_index = True ).set_index ('A' ).to_dict ()['G' ]
161-
162- self ._lg = pd .concat ([df .drop (['A' , 'G' , 'lgx' , '3d' ], axis = 1 )
163- .rename (index = str ,
164- columns = {"2d" : "A" })[['A' , 'lg' ]],
165- df .drop (['A' , 'G' , 'lgx' , '2d' ], axis = 1 )
166- .rename (index = str ,
167- columns = {"3d" : "A" })[['A' , 'lg' ]],
168- df [['A' , 'lg' ]]],
169- ignore_index = True ).set_index ('A' ).to_dict ()['lg' ]
170-
171- self ._lgx = pd .concat ([df .drop (['A' , 'lg' , 'G' , '3d' ], axis = 1 )
172- .rename (index = str ,
173- columns = {"2d" : "A" })[['A' , 'lgx' ]],
174- df .drop (['A' , 'lg' , 'G' , '2d' ], axis = 1 )
175- .rename (index = str ,
176- columns = {"3d" : "A" })[['A' , 'lgx' ]],
177- df [['A' , 'lgx' ]]],
178- ignore_index = True ).set_index ('A' ).to_dict ()['lgx' ]
139+ df_G = pd .concat ([
140+ df [['2d' , 'G' ]].rename (columns = {'2d' : 'A' }),
141+ df [['3d' , 'G' ]].rename (columns = {'3d' : 'A' }),
142+ df [['A' , 'G' ]]
143+ ], ignore_index = True )
144+ self ._G = df_G .set_index ('A' )['G' ].to_dict ()
145+
146+ df_lg = pd .concat ([
147+ df [['2d' , 'lg' ]].rename (columns = {'2d' : 'A' }),
148+ df [['3d' , 'lg' ]].rename (columns = {'3d' : 'A' }),
149+ df [['A' , 'lg' ]]
150+ ])
151+ self ._lg = df_lg .set_index ('A' )['lg' ].to_dict ()
152+
153+ df_lgx = pd .concat ([
154+ df [['2d' , 'lgx' ]].rename (columns = {'2d' : 'A' }),
155+ df [['3d' , 'lgx' ]].rename (columns = {'3d' : 'A' }),
156+ df [['A' , 'lgx' ]]
157+ ])
158+ self ._lgx = df_lgx .set_index ('A' )['lgx' ].to_dict ()
159+
160+ ars_mapping = (self ._G , self ._lg , self ._lgx , self .dup_g )
161+ with open (mapping_file , 'wb' ) as save_file :
162+ pickle .dump (ars_mapping , save_file , protocol = pickle .HIGHEST_PROTOCOL )
179163
180164 def generate_mac_codes (self , data_dir ):
181165 """
@@ -287,7 +271,10 @@ def generate_alleles_and_xxcodes(self, dbversion: str, data_dir: str) -> None:
287271
288272 # Create a Pandas DataFrame from the allele list file
289273 # Skip the header (first 6 lines) and use only the Allele
290- allele_list_url = f'{ IMGT_HLA_URL } Latest/Allelelist.{ dbversion } .txt'
274+ if dbversion == "Latest" :
275+ allele_list_url = f'{ IMGT_HLA_URL } Latest/Allelelist.txt'
276+ else :
277+ allele_list_url = f'{ IMGT_HLA_URL } Latest/Allelelist.{ dbversion } .txt'
291278 allele_df = pd .read_csv (allele_list_url , header = 6 , usecols = ['Allele' ])
292279
293280 # Create a set of valid alleles
0 commit comments