Skip to content

Commit 7983a79

Browse files
authored
Broad XX enhancement and performance improvements (#32)
* handle broad XX codes * relshp file * packaging * performance * performance code clinic * gitignore and performance enhancements to pyard.py
1 parent 9488359 commit 7983a79

File tree

6 files changed

+180
-24
lines changed

6 files changed

+180
-24
lines changed

.gitignore

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,3 +99,8 @@ ENV/
9999

100100
# mypy
101101
.mypy_cache/
102+
103+
# downloaded
104+
*.txt
105+
*.pickle
106+
*.zip

MANIFEST.in

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
11

22
include AUTHORS.rst
3-
43
include CONTRIBUTING.rst
54
include HISTORY.rst
65
include LICENSE
76
include README.rst
7+
include pyard/*.csv
88

99
recursive-include tests *
1010
recursive-exclude * __pycache__
1111
recursive-exclude * *.py[co]
1212

13-
recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
13+
recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.csv

pyard/dna_relshp.csv

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,34 @@
1+
loc,broad_fam,fam
2+
A,09,23
3+
A,09,24
4+
A,10,25
5+
A,10,26
6+
A,10,34
7+
A,10,66
8+
A,19,29
9+
A,19,30
10+
A,19,31
11+
A,19,32
12+
A,19,33
13+
A,19,74
14+
A,28,68
15+
A,28,69
16+
B,05,51
17+
B,05,52
18+
B,12,44
19+
B,12,45
20+
B,16,38
21+
B,16,39
22+
B,17,57
23+
B,17,58
24+
B,21,49
25+
B,21,50
26+
B,22,54
27+
B,22,55
28+
B,22,56
29+
DQB1,01,05
30+
DQB1,01,06
31+
DRB1,02,15
32+
DRB1,02,16
33+
DRB1,06,13
34+
DRB1,06,14

pyard/pyard.py

Lines changed: 57 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# -*- coding: utf-8 -*-
22

33
#
4-
# pyars pyARS.
5-
# Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
4+
# pyard
5+
# Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
66
#
77
# This library is free software; you can redistribute it and/or modify it
88
# under the terms of the GNU Lesser General Public License as published
@@ -26,6 +26,8 @@
2626
import pickle
2727
import urllib.request
2828
import pandas as pd
29+
import functools
30+
from .smart_sort import smart_sort_comparator
2931
from .util import pandas_explode
3032
from .util import all_macs
3133
from operator import is_not
@@ -37,9 +39,9 @@
3739
ismac = lambda x: True if re.search(":\D+", x) else False
3840

3941

40-
logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
41-
datefmt='%m/%d/%Y %I:%M:%S %p',
42-
level=logging.INFO)
42+
# a module shouldn't decide the logging config; that's up to the calling program
43+
44+
#logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
4345

4446
import string
4547

@@ -108,6 +110,8 @@ def __init__(self, dbversion: str='Latest',
108110
self._download_mac = download_mac
109111
self._remove_invalid = remove_invalid
110112

113+
self.HLA_regex = re.compile("^HLA-")
114+
111115
# TODO: add check for valid ARD type
112116
# TODO: add check for valid db version
113117

@@ -120,6 +124,7 @@ def __init__(self, dbversion: str='Latest',
120124
allele_file = data_dir + '/AlleleList.' + str(dbversion) + ".txt"
121125
mac_file = data_dir + "/mac.txt"
122126
mac_pickle = data_dir + "/mac.pickle"
127+
broad_file = data_dir + "/dna_relshp.csv"
123128

124129
allele_url = "https://raw.githubusercontent.com/ANHIG/IMGTHLA/" \
125130
+ dbversion + "/Allelelist.txt"
@@ -184,20 +189,42 @@ def __init__(self, dbversion: str='Latest',
184189
dfxx = pd.DataFrame(pd.Series(allele_df['2d'].unique().tolist()),
185190
columns=['Allele'])
186191
dfxx['1d'] = dfxx['Allele'].apply(lambda x: x.split(":")[0])
192+
193+
# xxcodes maps a first field name to its expansion
187194
self.xxcodes = dfxx.groupby(['1d'])\
188195
.apply(lambda x: list(x['Allele']))\
189196
.to_dict()
190197

198+
# defined broad XX codes
199+
dfbroad = pd.read_csv(broad_file, skiprows=1, dtype=str,
200+
names=["Locus", "Broad", "Fam"], sep=",").dropna()
201+
202+
dictbroad = dfbroad.groupby(['Locus','Broad']).apply(lambda x: list(x['Fam'])).to_dict()
203+
204+
for (locus,broad) in dictbroad.keys():
205+
locusbroad="*".join([locus,broad])
206+
for split in dictbroad[(locus,broad)]:
207+
locussplit="*".join([locus,split])
208+
if locusbroad in self.xxcodes.keys():
209+
self.xxcodes[locusbroad].extend(self.xxcodes[locussplit])
210+
else:
211+
self.xxcodes[locusbroad] = self.xxcodes[locussplit]
212+
191213
allele_df['3d'] = allele_df['Allele'].apply(lambda a:
192214
":".join(a.split(":")[0:3]) +
193215
list(a)[-1] if list(a)[-1]
194216
in expre_chars and
195217
len(a.split(":")) > 3
196218
else ":".join(a.split(":")[0:3]))
197219

220+
# all alleles are valid and also shortening to 3 and 2 fields
198221
self.valid = list(set(allele_df['Allele'].tolist()
199222
+ allele_df['2d'].tolist()
200223
+ allele_df['3d'].tolist()))
224+
# use a dict
225+
self.valid_dict={}
226+
for i in self.valid:
227+
self.valid_dict[i]=True
201228

202229
# Loading ARS file into pandas
203230
# TODO: Make skip dynamic in case the files are not consistent
@@ -344,6 +371,7 @@ def lgx(self):
344371
"""
345372
return self._lgx
346373

374+
@functools.lru_cache(maxsize=None)
347375
def redux(self, allele: str, ars_type: str) -> str:
348376
"""
349377
Does ARS reduction with allele and ARS type
@@ -356,18 +384,21 @@ def redux(self, allele: str, ars_type: str) -> str:
356384
:rtype: str
357385
"""
358386

359-
if re.search("HLA-", allele):
387+
# PERFORMANCE: precompiled regex
388+
# dealing with leading HLA-
389+
390+
if self.HLA_regex.search(allele):
360391
hla, allele_name = allele.split("-")
361392
return "-".join(["HLA", self.redux(allele_name, ars_type)])
362393

363-
if ars_type == "G" and allele in self.G:
394+
if ars_type == "G" and allele in self._G:
364395
if allele in self.dup_g:
365396
return self.dup_g[allele]
366397
else:
367398
return self.G[allele]
368-
elif ars_type == "lg" and allele in self.lg:
399+
elif ars_type == "lg" and allele in self._lg:
369400
return self.lg[allele]
370-
elif ars_type == "lgx" and allele in self.lgx:
401+
elif ars_type == "lgx" and allele in self._lgx:
371402
return self.lgx[allele]
372403
else:
373404
if self.remove_invalid:
@@ -378,6 +409,7 @@ def redux(self, allele: str, ars_type: str) -> str:
378409
else:
379410
return allele
380411

412+
@functools.lru_cache(maxsize=None)
381413
def redux_gl(self, glstring: str, redux_type: str) -> str:
382414
"""
383415
Does ARS reduction with allele and ARS type
@@ -394,25 +426,27 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
394426
return ""
395427

396428
if re.search("\^", glstring):
397-
return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(loci_sort)))
429+
return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(smart_sort_comparator)))
398430

399431
if re.search("\|", glstring):
400-
return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(loci_sort)))
432+
return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(smart_sort_comparator)))
401433

402434
if re.search("\+", glstring):
403-
return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(loci_sort)))
435+
return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(smart_sort_comparator)))
404436

405437
if re.search("\~", glstring):
406438
return "~".join([self.redux_gl(a, redux_type) for a in glstring.split("~")])
407439

408440
if re.search("/", glstring):
409-
return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(loci_sort)))
441+
return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(smart_sort_comparator)))
410442

411443
loc_allele = glstring.split(":")
412444
loc_name, code = loc_allele[0], loc_allele[1]
445+
446+
# handle XX codes
413447
if(ismac(glstring) and glstring.split(":")[1] == "XX"):
414448
loc, n = loc_name.split("*")
415-
return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(loci_sort))), redux_type)
449+
return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
416450

417451
if ismac(glstring) and code in self.mac:
418452
if re.search("HLA-", glstring):
@@ -423,37 +457,39 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
423457
[loc_name + ":" + a if len(a) <= 3
424458
else loc + "*" + a
425459
for a in self.mac[code]['Alleles']]))
426-
return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(loci_sort))), redux_type)
460+
return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
427461
else:
428462
loc, n = loc_name.split("*")
429463
alleles = list(filter(lambda a: a in self.valid,
430464
[loc_name + ":" + a if len(a) <= 3
431465
else loc + "*" + a
432466
for a in self.mac[code]['Alleles']]))
433-
return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(loci_sort))), redux_type)
467+
return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
434468
return self.redux(glstring, redux_type)
435469

436-
def isvalid(self, allele: str) -> str:
470+
def isvalid(self, allele: str) -> bool:
437471
"""
438472
Determines validity of an allele
439473
440474
:param allele: An HLA allele.
441475
:type: str
442476
:return: True if the allele is known to be valid (names matched by ismac are always reported valid), otherwise False
443-
:rtype: boolean
477+
:rtype: bool
444478
"""
445479
if not ismac(allele):
446-
return allele in self.valid
480+
# PERFORMANCE: use hash instead of allele in "list"
481+
# return allele in self.valid
482+
return self.valid_dict.get(allele, False)
447483
return True
448484

449-
def isvalid_gl(self, glstring: str) -> str:
485+
def isvalid_gl(self, glstring: str) -> bool:
450486
"""
451487
Determine validity of glstring
452488
453489
:param glstring
454490
:type: str
455491
:return: result
456-
:rtype: boolean
492+
:rtype: bool
457493
"""
458494

459495
if re.search("\^", glstring):

pyard/smart_sort.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,80 @@
1+
import functools
2+
import re
3+
4+
expr_regex = re.compile('[NQLS]')
5+
6+
# Matches the run of digits at the start of a single allele field.
_LEADING_DIGITS = re.compile(r'\d+')


def _allele_sort_key(allele):
    """
    Build a sortable key from the colon-separated fields of an allele name.

    The text after the first ``*`` (or the whole string when there is no
    ``*``) is split on ``:``. A field that starts with digits becomes
    ``(0, <int value of those digits>)``, so trailing letters such as the
    N/Q/L/S suffixes are ignored. Any other field becomes ``(1, <field>)``
    so that it sorts after every numeric field without ever comparing an
    int against a str.

    :param allele: allele name, e.g. ``A*01:01:01N``
    :return: list of ``(rank, value)`` tuples, one per field
    """
    _, star, fields = allele.partition('*')
    if not star:
        # No locus prefix: treat the whole string as the field list.
        fields = allele

    key = []
    for field in fields.split(':'):
        digits = _LEADING_DIGITS.match(field)
        if digits:
            key.append((0, int(digits.group())))
        else:
            key.append((1, field))
    return key


@functools.lru_cache(maxsize=None)
def smart_sort_comparator(a1, a2):
    """
    Natural sort comparator for two alleles.

    Python sorts strings lexicographically but HLA alleles need
    to be sorted by numerical values in each field of the HLA nomenclature.
    All fields are compared, left to right, so names with more than two
    fields are ordered instead of raising ``ValueError``.

    Intended for use with ``functools.cmp_to_key``.

    :param a1: first allele
    :param a2: second allele
    :return: -1 if a1 sorts before a2, 1 if it sorts after, 0 if they
             compare equal (identical names, or names whose fields have the
             same numeric values)
    """
    # Identical names need no parsing.
    if a1 == a2:
        return 0

    key1 = _allele_sort_key(a1)
    key2 = _allele_sort_key(a2)

    # Lists compare element by element; a shorter name that is a prefix of
    # a longer one sorts first.
    return (key1 > key2) - (key1 < key2)
46+
47+
def smart_sort_alleles(a1, a2):
    """
    Return two alleles as a list in natural sort order.

    Python sorts strings lexicographically but HLA alleles need
    to be sorted by numerical values in each field of the HLA nomenclature.
    All colon-separated fields are compared, left to right, and trailing
    letters on a field (such as the N/Q/L/S suffixes) are ignored, so
    names with more than two fields or with a suffix no longer raise
    ``ValueError``.

    :param a1: first allele
    :param a2: second allele
    :return: ``[a1, a2]`` when a1 sorts before or equal to a2,
             otherwise ``[a2, a1]``
    """

    def sort_key(allele):
        # Text after the first '*' (whole string if there is none), split
        # into fields. Numeric fields become (0, int); anything else
        # becomes (1, text) so ints and strs are never compared directly.
        _, star, fields = allele.partition('*')
        if not star:
            fields = allele
        key = []
        for field in fields.split(':'):
            digits = re.match(r'\d+', field)
            if digits:
                key.append((0, int(digits.group())))
            else:
                key.append((1, field))
        return key

    # Identical names need no parsing.
    if a1 == a2:
        return [a1, a2]

    # Swap only when a2 strictly precedes a1; ties keep the given order.
    if sort_key(a2) < sort_key(a1):
        return [a2, a1]
    return [a1, a2]

setup.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,5 +65,6 @@
6565
'Programming Language :: Python :: 3.7',
6666
],
6767
test_suite='tests',
68-
tests_require=test_requirements
68+
tests_require=test_requirements,
69+
include_package_data=True
6970
)

0 commit comments

Comments
 (0)