11# -*- coding: utf-8 -*-
22
33#
4- # pyars pyARS.
5- # Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
4+ # pyard
5+ # Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
66#
77# This library is free software; you can redistribute it and/or modify it
88# under the terms of the GNU Lesser General Public License as published
2626import pickle
2727import urllib .request
2828import pandas as pd
29+ import functools
30+ from .smart_sort import smart_sort_comparator
2931from .util import pandas_explode
3032from .util import all_macs
3133from operator import is_not
3739ismac = lambda x : True if re .search (":\D+" , x ) else False
3840
3941
40- logging . basicConfig ( format = '%(asctime)s - %(name)s - %(levelname)s - %(message)s' ,
41- datefmt = '%m/%d/%Y %I:%M:%S %p' ,
42- level = logging .INFO )
42+ # a module shouldn't decide the logging config; that's up to the calling program
43+
44+ #logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
4345
4446import string
4547
@@ -108,6 +110,8 @@ def __init__(self, dbversion: str='Latest',
108110 self ._download_mac = download_mac
109111 self ._remove_invalid = remove_invalid
110112
113+ self .HLA_regex = re .compile ("^HLA-" )
114+
111115 # TODO: add check for valid ARD type
112116 # TODO: add check for valid db version
113117
@@ -120,6 +124,7 @@ def __init__(self, dbversion: str='Latest',
120124 allele_file = data_dir + '/AlleleList.' + str (dbversion ) + ".txt"
121125 mac_file = data_dir + "/mac.txt"
122126 mac_pickle = data_dir + "/mac.pickle"
127+ broad_file = data_dir + "/dna_relshp.csv"
123128
124129 allele_url = "https://raw.githubusercontent.com/ANHIG/IMGTHLA/" \
125130 + dbversion + "/Allelelist.txt"
@@ -184,20 +189,42 @@ def __init__(self, dbversion: str='Latest',
184189 dfxx = pd .DataFrame (pd .Series (allele_df ['2d' ].unique ().tolist ()),
185190 columns = ['Allele' ])
186191 dfxx ['1d' ] = dfxx ['Allele' ].apply (lambda x : x .split (":" )[0 ])
192+
193+ # xxcodes maps a first field name to its expansion
187194 self .xxcodes = dfxx .groupby (['1d' ])\
188195 .apply (lambda x : list (x ['Allele' ]))\
189196 .to_dict ()
190197
198+ # broad XX codes: a broad's expansion is built from the expansions of its splits (dna_relshp.csv)
199+ dfbroad = pd .read_csv (broad_file , skiprows = 1 , dtype = str ,
200+ names = ["Locus" , "Broad" , "Fam" ], sep = "," ).dropna ()
201+
202+ dictbroad = dfbroad .groupby (['Locus' ,'Broad' ]).apply (lambda x : list (x ['Fam' ])).to_dict ()
203+
204+ for (locus ,broad ) in dictbroad .keys ():
205+ locusbroad = "*" .join ([locus ,broad ])
206+ for split in dictbroad [(locus ,broad )]:
207+ locussplit = "*" .join ([locus ,split ])
208+ if locusbroad in self .xxcodes .keys ():
209+ self .xxcodes [locusbroad ].extend (self .xxcodes [locussplit ])
210+ else :
211+ self .xxcodes [locusbroad ] = self .xxcodes [locussplit ]
212+
191213 allele_df ['3d' ] = allele_df ['Allele' ].apply (lambda a :
192214 ":" .join (a .split (":" )[0 :3 ]) +
193215 list (a )[- 1 ] if list (a )[- 1 ]
194216 in expre_chars and
195217 len (a .split (":" )) > 3
196218 else ":" .join (a .split (":" )[0 :3 ]))
197219
220+ # every allele name is valid, as are its 3-field and 2-field shortenings
198221 self .valid = list (set (allele_df ['Allele' ].tolist ()
199222 + allele_df ['2d' ].tolist ()
200223 + allele_df ['3d' ].tolist ()))
224+ # mirror self.valid in a dict so validity checks are hash lookups, not list scans
225+ self .valid_dict = {}
226+ for i in self .valid :
227+ self .valid_dict [i ]= True
201228
202229 # Loading ARS file into pandas
203230 # TODO: Make skip dynamic in case the files are not consistent
@@ -344,6 +371,7 @@ def lgx(self):
344371 """
345372 return self ._lgx
346373
374+ @functools .lru_cache (maxsize = None )
347375 def redux (self , allele : str , ars_type : str ) -> str :
348376 """
349377 Does ARS reduction with allele and ARS type
@@ -356,18 +384,21 @@ def redux(self, allele: str, ars_type: str) -> str:
356384 :rtype: str
357385 """
358386
359- if re .search ("HLA-" , allele ):
387+ # PERFORMANCE: precompiled regex
388+ # dealing with leading HLA-
389+
390+ if self .HLA_regex .search (allele ):
360391 hla , allele_name = allele .split ("-" )
361392 return "-" .join (["HLA" , self .redux (allele_name , ars_type )])
362393
363- if ars_type == "G" and allele in self .G :
394+ if ars_type == "G" and allele in self ._G :
364395 if allele in self .dup_g :
365396 return self .dup_g [allele ]
366397 else :
367398 return self .G [allele ]
368- elif ars_type == "lg" and allele in self .lg :
399+ elif ars_type == "lg" and allele in self ._lg :
369400 return self .lg [allele ]
370- elif ars_type == "lgx" and allele in self .lgx :
401+ elif ars_type == "lgx" and allele in self ._lgx :
371402 return self .lgx [allele ]
372403 else :
373404 if self .remove_invalid :
@@ -378,6 +409,7 @@ def redux(self, allele: str, ars_type: str) -> str:
378409 else :
379410 return allele
380411
412+ @functools .lru_cache (maxsize = None )
381413 def redux_gl (self , glstring : str , redux_type : str ) -> str :
382414 """
383415 Does ARS reduction with allele and ARS type
@@ -394,25 +426,27 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
394426 return ""
395427
396428 if re .search ("\^" , glstring ):
397- return "^" .join (sorted (set ([self .redux_gl (a , redux_type ) for a in glstring .split ("^" )]), key = functools .cmp_to_key (loci_sort )))
429+ return "^" .join (sorted (set ([self .redux_gl (a , redux_type ) for a in glstring .split ("^" )]), key = functools .cmp_to_key (smart_sort_comparator )))
398430
399431 if re .search ("\|" , glstring ):
400- return "|" .join (sorted (set ([self .redux_gl (a , redux_type ) for a in glstring .split ("|" )]), key = functools .cmp_to_key (loci_sort )))
432+ return "|" .join (sorted (set ([self .redux_gl (a , redux_type ) for a in glstring .split ("|" )]), key = functools .cmp_to_key (smart_sort_comparator )))
401433
402434 if re .search ("\+" , glstring ):
403- return "+" .join (sorted ([self .redux_gl (a , redux_type ) for a in glstring .split ("+" )], key = functools .cmp_to_key (loci_sort )))
435+ return "+" .join (sorted ([self .redux_gl (a , redux_type ) for a in glstring .split ("+" )], key = functools .cmp_to_key (smart_sort_comparator )))
404436
405437 if re .search ("\~" , glstring ):
406438 return "~" .join ([self .redux_gl (a , redux_type ) for a in glstring .split ("~" )])
407439
408440 if re .search ("/" , glstring ):
409- return "/" .join (sorted (set ([self .redux_gl (a , redux_type ) for a in glstring .split ("/" )]), key = functools .cmp_to_key (loci_sort )))
441+ return "/" .join (sorted (set ([self .redux_gl (a , redux_type ) for a in glstring .split ("/" )]), key = functools .cmp_to_key (smart_sort_comparator )))
410442
411443 loc_allele = glstring .split (":" )
412444 loc_name , code = loc_allele [0 ], loc_allele [1 ]
445+
446+ # handle XX codes
413447 if (ismac (glstring ) and glstring .split (":" )[1 ] == "XX" ):
414448 loc , n = loc_name .split ("*" )
415- return self .redux_gl ("/" .join (sorted (self .xxcodes [loc_name ], key = functools .cmp_to_key (loci_sort ))), redux_type )
449+ return self .redux_gl ("/" .join (sorted (self .xxcodes [loc_name ], key = functools .cmp_to_key (smart_sort_comparator ))), redux_type )
416450
417451 if ismac (glstring ) and code in self .mac :
418452 if re .search ("HLA-" , glstring ):
@@ -423,37 +457,39 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
423457 [loc_name + ":" + a if len (a ) <= 3
424458 else loc + "*" + a
425459 for a in self .mac [code ]['Alleles' ]]))
426- return self .redux_gl ("/" .join (sorted (["HLA-" + a for a in alleles ], key = functools .cmp_to_key (loci_sort ))), redux_type )
460+ return self .redux_gl ("/" .join (sorted (["HLA-" + a for a in alleles ], key = functools .cmp_to_key (smart_sort_comparator ))), redux_type )
427461 else :
428462 loc , n = loc_name .split ("*" )
429463 alleles = list (filter (lambda a : a in self .valid ,
430464 [loc_name + ":" + a if len (a ) <= 3
431465 else loc + "*" + a
432466 for a in self .mac [code ]['Alleles' ]]))
433- return self .redux_gl ("/" .join (sorted (alleles , key = functools .cmp_to_key (loci_sort ))), redux_type )
467+ return self .redux_gl ("/" .join (sorted (alleles , key = functools .cmp_to_key (smart_sort_comparator ))), redux_type )
434468 return self .redux (glstring , redux_type )
435469
436- def isvalid (self , allele : str ) -> str :
470+ def isvalid (self , allele : str ) -> bool :
437471 """
438472 Determines validity of an allele
439473
440474 :param allele: An HLA allele.
441475 :type: str
442476 :return: True if the allele is valid or looks like a MAC code, False otherwise
443- :rtype: boolean
477+ :rtype: bool
444478 """
445479 if not ismac (allele ):
446- return allele in self .valid
480+ # PERFORMANCE: dict lookup instead of scanning the self.valid list
481+ # return allele in self.valid
482+ return self .valid_dict .get (allele , False )
447483 return True
448484
449- def isvalid_gl (self , glstring : str ) -> str :
485+ def isvalid_gl (self , glstring : str ) -> bool :
450486 """
451487 Determine validity of glstring
452488
453489 :param glstring
454490 :type: str
455491 :return: result
456- :rtype: boolean
492+ :rtype: bool
457493 """
458494
459495 if re .search ("\^" , glstring ):
0 commit comments