Smart Sort Fix and cleanup

pbashyal-nmdp · pbashyal-nmdp · commit 937469c47824 · 2020-09-08T12:31:31.000-05:00
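- Fix 4th field comparison bug: the second allele's 4th field was assigned to `a2_f3` instead of `a2_f4`, so the 4th-field comparison referenced an undefined name
- Return 0 early when the two alleles are identical after the expression characters are stripped
- Remove unused function `smart_sort_alleles`
- Cleanup: split each allele into fields once, fix comment wording and spelling, normalize spacing
- Add unit tests for the comparator in `tests/test_smart_sort.py`
- Bump version to `0.0.21`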
diff --git a/pyard/smart_sort.py b/pyard/smart_sort.py
@@ -3,12 +3,13 @@
 
 expr_regex = re.compile('[NQLSGg]')
 
+
 @functools.lru_cache(maxsize=None)
 def smart_sort_comparator(a1, a2):
     """
     Natural sort 2 given alleles.
 
-    Python sorts strings lexographically but HLA alleles need
+    Python sorts strings lexicographically but HLA alleles need
     to be sorted by numerical values in each field of the HLA nomenclature.
 
     :param a1: first allele
@@ -19,85 +20,53 @@ def smart_sort_comparator(a1, a2):
     if a1 == a2:
         return 0
 
-
     # remove any non-numerics
     a1 = re.sub(expr_regex, '', a1)
     a2 = re.sub(expr_regex, '', a2)
+
+    # Check to see if they are still the same alleles
+    if a1 == a2:
+        return 0
+
     # Extract and Compare first fields first
-    a1_f1 = int(a1[a1.find('*')+1:a1.find(':')])
-    a2_f1 = int(a2[a2.find('*')+1:a2.find(':')])
+    a1_f1 = int(a1[a1.find('*') + 1:a1.find(':')])
+    a2_f1 = int(a2[a2.find('*') + 1:a2.find(':')])
 
     if a1_f1 < a2_f1:
         return -1
     if a1_f1 > a2_f1:
         return 1
 
-    # If the first fields are equal, try the 2nd fields
+    a1_fields = a1.split(':')
+    a2_fields = a2.split(':')
 
-    a1_f2 = int(a1.split(':')[1])
-    a2_f2 = int(a2.split(':')[1])
+    # If the first fields are equal, try the 2nd fields
+    a1_f2 = int(a1_fields[1])
+    a2_f2 = int(a2_fields[1])
 
     if a1_f2 < a2_f2:
         return -1
     if a1_f2 > a2_f2:
         return 1
 
-    # If the two fields are equal, try the 3rd fields
-
-    a1_f3 = int(a1.split(':')[2])
-    a2_f3 = int(a2.split(':')[2])
+    # If the second fields are equal, try the 3rd fields
+    a1_f3 = int(a1_fields[2])
+    a2_f3 = int(a2_fields[2])
 
     if a1_f3 < a2_f3:
         return -1
     if a1_f3 > a2_f3:
         return 1
 
-    # If the two fields are equal, try the 4th fields
-
-    a1_f4 = int(a1.split(':')[3])
-    a2_f3 = int(a2.split(':')[3])
+    # If the third fields are equal, try the 4th fields
+    a1_f4 = int(a1_fields[3])
+    a2_f4 = int(a2_fields[3])
 
     if a1_f4 < a2_f4:
         return -1
     if a1_f4 > a2_f4:
         return 1
 
-
-    
-    # All fields are equal
+    # All fields are considered equal after 4th field
     return 0
 
-def smart_sort_alleles(a1, a2):
-    """
-    Natural sort 2 given alleles.
-
-    Python sorts strings lexographically but HLA alleles need
-    to be sorted by numerical values in each field of the HLA nomenclature.
-
-    :param a1: first allele
-    :param a2: second allele
-    """
-    # Check to see if they are the same alleles
-    if a1 == a2:
-        return [a1, a2]
-
-    # Extract and Compare first fields first
-    a1_f1 = int(a1[a1.find('*')+1:a1.find(':')])
-    a2_f1 = int(a2[a2.find('*')+1:a2.find(':')])
-
-    if a1_f1 < a2_f1:
-        return [a1, a2]
-    if a1_f1 > a2_f1:
-        return [a2, a1]
-
-    # If the first fields are equal, try the 2nd fields
-    a1_f2 = int(a1[a1.find(':')+1:])
-    a2_f2 = int(a2[a2.find(':')+1:])
-
-    if a1_f2 < a2_f2:
-        return [a1, a2]
-    if a1_f2 > a2_f2:
-        return [a2, a1]
-
-    # All fields are equal
-    return [a1, a2]
diff --git a/setup.py b/setup.py
@@ -42,7 +42,7 @@
 
 setup(
     name='py-ard',
-    version='0.0.20',
+    version='0.0.21',
     description="ARD reduction for HLA with python",
     long_description=readme + '\n\n' + history,
     author="CIBMTR",
diff --git a/tests/test_smart_sort.py b/tests/test_smart_sort.py
@@ -0,0 +1,89 @@
+import unittest
+
+from pyard.smart_sort import smart_sort_comparator
+
+
+class TestSmartSort(unittest.TestCase):
+
+    def setUp(self) -> None:
+        super().setUp()
+
+    def test_same_comparator(self):
+        allele = "HLA-A*01:01"
+        self.assertEqual(smart_sort_comparator(allele, allele), 0)
+
+    def test_equal_comparator(self):
+        allele1 = "HLA-A*01:01"
+        allele2 = "HLA-A*01:01"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), 0)
+
+    def test_equal_comparator_G(self):
+        # Should compare without G
+        allele1 = "HLA-A*01:01G"
+        allele2 = "HLA-A*01:01"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), 0)
+
+    def test_equal_comparator_NG(self):
+        # Should compare without N and G
+        allele1 = "HLA-A*01:01G"
+        allele2 = "HLA-A*01:01N"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), 0)
+
+    def test_first_field_comparator_le(self):
+        allele1 = "HLA-A*01:01"
+        allele2 = "HLA-A*02:01"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), -1)
+
+    def test_first_field_comparator_ge(self):
+        allele1 = "HLA-A*02:01"
+        allele2 = "HLA-A*01:01"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), 1)
+
+    def test_second_field_comparator_le(self):
+        allele1 = "HLA-A*01:01"
+        allele2 = "HLA-A*01:02"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), -1)
+
+    def test_second_field_comparator_le_smart(self):
+        allele1 = "HLA-A*01:29"
+        allele2 = "HLA-A*01:100"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), -1)
+
+    def test_second_field_comparator_ge(self):
+        allele1 = "HLA-A*01:02"
+        allele2 = "HLA-A*01:01"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), 1)
+
+    def test_third_field_comparator_le(self):
+        allele1 = "HLA-A*01:01:01"
+        allele2 = "HLA-A*01:01:20"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), -1)
+
+    def test_third_field_comparator_le_smart(self):
+        allele1 = "HLA-A*01:01:29"
+        allele2 = "HLA-A*01:01:100"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), -1)
+
+    def test_third_field_comparator_ge(self):
+        allele1 = "HLA-A*01:01:02"
+        allele2 = "HLA-A*01:01:01"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), 1)
+
+    def test_fourth_field_comparator_le(self):
+        allele1 = "HLA-A*01:01:01:01"
+        allele2 = "HLA-A*01:01:01:20"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), -1)
+
+    def test_fourth_field_comparator_le_smart(self):
+        allele1 = "HLA-A*01:01:01:39"
+        allele2 = "HLA-A*01:01:01:200"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), -1)
+
+    def test_fourth_field_comparator_ge(self):
+        allele1 = "HLA-A*01:01:01:30"
+        allele2 = "HLA-A*01:01:01:09"
+        self.assertEqual(smart_sort_comparator(allele1, allele2), 1)
+
+
+if __name__ == '__main__':
+    unittest.main()