Broad XX enhancement and performance improvements (#32)

mmaiers-nmdp · web-flow · commit 7983a79bfcdc · 2020-03-10T08:53:52.000-05:00
* handle broad XX codes

* relshp file

* packaging

* performance

* performance code clinic

* gitignore and performance enhancements to pyard.py
diff --git a/.gitignore b/.gitignore
@@ -99,3 +99,8 @@ ENV/
 
 # mypy
 .mypy_cache/
+
+# downloaded
+*.txt 
+*.pickle
+*.zip
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,13 +1,13 @@
 
 include AUTHORS.rst
-
 include CONTRIBUTING.rst
 include HISTORY.rst
 include LICENSE
 include README.rst
+include pyard/*.csv
 
 recursive-include tests *
 recursive-exclude * __pycache__
 recursive-exclude * *.py[co]
 
-recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif
+recursive-include docs *.rst conf.py Makefile make.bat *.jpg *.png *.gif *.csv
diff --git a/pyard/dna_relshp.csv b/pyard/dna_relshp.csv
@@ -0,0 +1,34 @@
+loc,broad_fam,fam
+A,09,23
+A,09,24
+A,10,25
+A,10,26
+A,10,34
+A,10,66
+A,19,29
+A,19,30
+A,19,31
+A,19,32
+A,19,33
+A,19,74
+A,28,68
+A,28,69
+B,05,51
+B,05,52
+B,12,44
+B,12,45
+B,16,38
+B,16,39
+B,17,57
+B,17,58
+B,21,49
+B,21,50
+B,22,54
+B,22,55
+B,22,56
+DQB1,01,05
+DQB1,01,06
+DRB1,02,15
+DRB1,02,16
+DRB1,06,13
+DRB1,06,14
diff --git a/pyard/pyard.py b/pyard/pyard.py
@@ -1,8 +1,8 @@
 # -*- coding: utf-8 -*-
 
 #
-#    pyars pyARS.
-#    Copyright (c) 2018 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
+#    pyard
+#    Copyright (c) 2020 Be The Match operated by National Marrow Donor Program. All Rights Reserved.
 #
 #    This library is free software; you can redistribute it and/or modify it
 #    under the terms of the GNU Lesser General Public License as published
@@ -26,6 +26,8 @@
 import pickle
 import urllib.request
 import pandas as pd
+import functools
+from .smart_sort import smart_sort_comparator
 from .util import pandas_explode
 from .util import all_macs
 from operator import is_not
@@ -37,9 +39,9 @@
 ismac = lambda x: True if re.search(":\D+", x) else False
 
 
-logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
-                    datefmt='%m/%d/%Y %I:%M:%S %p',
-                    level=logging.INFO)
+# a module shouldn't decide the logging config; that's up to the calling program
+
+#logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', datefmt='%m/%d/%Y %I:%M:%S %p', level=logging.INFO)
 
 import string
 
@@ -108,6 +110,8 @@ def __init__(self, dbversion: str='Latest',
         self._download_mac = download_mac
         self._remove_invalid = remove_invalid
 
+        self.HLA_regex = re.compile("^HLA-")
+
         # TODO: add check for valid ARD type
         # TODO: add check for valid db version
 
@@ -120,6 +124,7 @@ def __init__(self, dbversion: str='Latest',
         allele_file = data_dir + '/AlleleList.' + str(dbversion) + ".txt"
         mac_file = data_dir + "/mac.txt"
         mac_pickle = data_dir + "/mac.pickle"
+        broad_file = data_dir + "/dna_relshp.csv"
 
         allele_url = "https://raw.githubusercontent.com/ANHIG/IMGTHLA/" \
                      + dbversion + "/Allelelist.txt"
@@ -184,20 +189,42 @@ def __init__(self, dbversion: str='Latest',
         dfxx = pd.DataFrame(pd.Series(allele_df['2d'].unique().tolist()),
                                       columns=['Allele'])
         dfxx['1d'] = dfxx['Allele'].apply(lambda x: x.split(":")[0])
+    
+        # xxcodes maps a first field name to its expansion
         self.xxcodes = dfxx.groupby(['1d'])\
                            .apply(lambda x: list(x['Allele']))\
                            .to_dict()
 
+        # defined broad XX codes
+        dfbroad = pd.read_csv(broad_file, skiprows=1, dtype=str,
+                         names=["Locus", "Broad", "Fam"], sep=",").dropna()
+
+        dictbroad = dfbroad.groupby(['Locus','Broad']).apply(lambda x: list(x['Fam'])).to_dict()
+
+        for (locus,broad) in dictbroad.keys():
+            locusbroad="*".join([locus,broad])  
+            for split in dictbroad[(locus,broad)]:
+                locussplit="*".join([locus,split])
+                if locusbroad in self.xxcodes.keys():
+                    self.xxcodes[locusbroad].extend(self.xxcodes[locussplit])
+                else:
+                    self.xxcodes[locusbroad] = self.xxcodes[locussplit]
+
         allele_df['3d'] = allele_df['Allele'].apply(lambda a:
                                  ":".join(a.split(":")[0:3]) +
                                  list(a)[-1] if list(a)[-1]
                                  in expre_chars and
                                  len(a.split(":")) > 3
                                  else ":".join(a.split(":")[0:3]))
 
+        # valid names: every full allele plus its 3-field and 2-field shortenings
         self.valid = list(set(allele_df['Allele'].tolist()
                               + allele_df['2d'].tolist()
                               + allele_df['3d'].tolist()))
+        # mirror the list in a dict so membership checks are hash lookups
+        self.valid_dict={}
+        for i in self.valid:
+            self.valid_dict[i]=True
 
         # Loading ARS file into pandas
         # TODO: Make skip dynamic in case the files are not consistent
@@ -344,6 +371,7 @@ def lgx(self):
         """
         return self._lgx
 
+    @functools.lru_cache(maxsize=None)
     def redux(self, allele: str, ars_type: str) -> str:
         """
         Does ARS reduction with allele and ARS type
@@ -356,18 +384,21 @@ def redux(self, allele: str, ars_type: str) -> str:
         :rtype: str
         """
 
-        if re.search("HLA-", allele):
+        # PERFORMANCE: precompiled regex
+        # dealing with leading HLA-
+
+        if self.HLA_regex.search(allele):
             hla, allele_name = allele.split("-")
             return "-".join(["HLA", self.redux(allele_name, ars_type)])
 
-        if ars_type == "G" and allele in self.G:
+        if ars_type == "G" and allele in self._G:
             if allele in self.dup_g:
                 return self.dup_g[allele]
             else:
                 return self.G[allele]
-        elif ars_type == "lg" and allele in self.lg:
+        elif ars_type == "lg" and allele in self._lg:
             return self.lg[allele]
-        elif ars_type == "lgx" and allele in self.lgx:
+        elif ars_type == "lgx" and allele in self._lgx:
             return self.lgx[allele]
         else:
             if self.remove_invalid:
@@ -378,6 +409,7 @@ def redux(self, allele: str, ars_type: str) -> str:
             else:
                 return allele
 
+    @functools.lru_cache(maxsize=None)
     def redux_gl(self, glstring: str, redux_type: str) -> str:
         """
         Does ARS reduction with allele and ARS type
@@ -394,25 +426,27 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
             return ""
 
         if re.search("\^", glstring):
-            return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(loci_sort)))
+            return "^".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("^")]), key=functools.cmp_to_key(smart_sort_comparator)))
 
         if re.search("\|", glstring):
-            return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(loci_sort)))
+            return "|".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("|")]), key=functools.cmp_to_key(smart_sort_comparator)))
 
         if re.search("\+", glstring):
-            return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(loci_sort)))
+            return "+".join(sorted([self.redux_gl(a, redux_type) for a in glstring.split("+")], key=functools.cmp_to_key(smart_sort_comparator)))
 
         if re.search("\~", glstring):
             return "~".join([self.redux_gl(a, redux_type) for a in glstring.split("~")])
 
         if re.search("/", glstring):
-            return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(loci_sort)))
+            return "/".join(sorted(set([self.redux_gl(a, redux_type) for a in glstring.split("/")]), key=functools.cmp_to_key(smart_sort_comparator)))
 
         loc_allele = glstring.split(":")
         loc_name, code = loc_allele[0], loc_allele[1]
+       
+        # handle XX codes
         if(ismac(glstring) and glstring.split(":")[1] == "XX"):
             loc, n = loc_name.split("*")
-            return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(loci_sort))), redux_type)
+            return self.redux_gl("/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
 
         if ismac(glstring) and code in self.mac:
             if re.search("HLA-", glstring):
@@ -423,37 +457,39 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
                                       [loc_name + ":" + a if len(a) <= 3
                                        else loc + "*" + a
                                        for a in self.mac[code]['Alleles']]))
-                return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(loci_sort))), redux_type)
+                return self.redux_gl("/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
             else:
                 loc, n = loc_name.split("*")
                 alleles = list(filter(lambda a: a in self.valid,
                                       [loc_name + ":" + a if len(a) <= 3
                                        else loc + "*" + a
                                        for a in self.mac[code]['Alleles']]))
-                return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(loci_sort))), redux_type)
+                return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
         return self.redux(glstring, redux_type)
 
-    def isvalid(self, allele: str) -> str:
+    def isvalid(self, allele: str) -> bool:
         """
         Determines validity of an allele
 
         :param allele: An HLA allele.
         :type: str
         :return: allele or empty
-        :rtype: boolean
+        :rtype: bool
         """
         if not ismac(allele):
-            return allele in self.valid
+            # PERFORMANCE: use hash instead of allele in "list"
+            # return allele in self.valid
+            return self.valid_dict.get(allele, False)
         return True
 
-    def isvalid_gl(self, glstring: str) -> str:
+    def isvalid_gl(self, glstring: str) -> bool:
         """
         Determine validity of glstring
 
         :param glstring
         :type: str
         :return: result
-        :rtype: boolean
+        :rtype: bool
         """
         
         if re.search("\^", glstring):
diff --git a/pyard/smart_sort.py b/pyard/smart_sort.py
@@ -0,0 +1,80 @@
+import functools
+import re
+
+expr_regex = re.compile('[NQLS]')
+
+@functools.lru_cache(maxsize=None)
+def smart_sort_comparator(a1, a2):
+    """
+    Natural sort 2 given alleles.
+
+    Python sorts strings lexicographically but HLA alleles need
+    to be sorted by numerical values in each field of the HLA nomenclature.
+
+    :param a1: first allele
+    :param a2: second allele
+    """
+
+    # Check to see if they are the same alleles
+    if a1 == a2:
+        return 0
+
+
+    # remove any non-numerics
+    a1 = re.sub(expr_regex, '', a1)
+    a2 = re.sub(expr_regex, '', a2)
+    # Extract and Compare first fields first
+    a1_f1 = int(a1[a1.find('*')+1:a1.find(':')])
+    a2_f1 = int(a2[a2.find('*')+1:a2.find(':')])
+
+    if a1_f1 < a2_f1:
+        return -1
+    if a1_f1 > a2_f1:
+        return 1
+
+    # If the first fields are equal, try the 2nd fields
+    a1_f2 = int(a1[a1.find(':')+1:])
+    a2_f2 = int(a2[a2.find(':')+1:])
+
+    if a1_f2 < a2_f2:
+        return -1
+    if a1_f2 > a2_f2:
+        return 1
+
+    # All fields are equal
+    return 0
+
+def smart_sort_alleles(a1, a2):
+    """
+    Natural sort 2 given alleles.
+
+    Python sorts strings lexicographically but HLA alleles need
+    to be sorted by numerical values in each field of the HLA nomenclature.
+
+    :param a1: first allele
+    :param a2: second allele
+    """
+    # Check to see if they are the same alleles
+    if a1 == a2:
+        return [a1, a2]
+
+    # Extract and Compare first fields first
+    a1_f1 = int(a1[a1.find('*')+1:a1.find(':')])
+    a2_f1 = int(a2[a2.find('*')+1:a2.find(':')])
+
+    if a1_f1 < a2_f1:
+        return [a1, a2]
+    if a1_f1 > a2_f1:
+        return [a2, a1]
+
+    # If the first fields are equal, try the 2nd fields
+    a1_f2 = int(a1[a1.find(':')+1:])
+    a2_f2 = int(a2[a2.find(':')+1:])
+
+    if a1_f2 < a2_f2:
+        return [a1, a2]
+    if a1_f2 > a2_f2:
+        return [a2, a1]
+
+    # All fields are equal
+    return [a1, a2]
diff --git a/setup.py b/setup.py
@@ -65,5 +65,6 @@
         'Programming Language :: Python :: 3.7',
     ],
     test_suite='tests',
-    tests_require=test_requirements
+    tests_require=test_requirements,
+    include_package_data=True
 )

Original file line number	Diff line number	Diff line change
`@@ -65,5 +65,6 @@`
`65`	`65`	`'Programming Language :: Python :: 3.7',`
`66`	`66`	`],`
`67`	`67`	`test_suite='tests',`
`68`		`- tests_require=test_requirements`
	`68`	`+ tests_require=test_requirements,`
	`69`	`+ include_package_data=True`
`69`	`70`	`)`