from collections import namedtuple
import functools
import sqlite3
-
import pandas as pd

from . import db
from .broad_splits import broad_splits_dna_mapping
from .broad_splits import broad_splits_ser_mapping
from .misc import get_2field_allele, get_3field_allele, number_of_fields
-from .misc import expression_chars
+from .misc import expression_chars, get_G_name, get_P_name

# GitHub URL where IMGT HLA files are downloaded.
from pyard.smart_sort import smart_sort_comparator
    "lgx_group",
    "exon_group",
    "p_group",
+    "p_not_g",
]
ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)
@@ -102,6 +102,9 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    p_group = db.load_dict(
        db_connection, table_name="p_group", columns=("allele", "p")
    )
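+    # p_not_g: alleles whose P-group has no corresponding G-group, mapped to their lgx form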
+    p_not_g = db.load_dict(
+        db_connection, table_name="p_not_g", columns=("allele", "lgx")
+    )
    return ARSMapping(
        dup_g=dup_g,
        dup_lg=dup_lg,
@@ -111,13 +114,46 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
        lgx_group=lgx_group,
        exon_group=exon_group,
        p_group=p_group,
+        p_not_g=p_not_g,
    )

+    # load the hla_nom_g.txt
    ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt"
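+    # each row: Locus;allele1/allele2/...;G-group name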
    df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

+    # the G-group is named for its first allele
+    df["G"] = df["A"].apply(get_G_name)
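+    # e.g. a group whose first allele is 02:01:01:01 is named 02:01:01G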
+
+    # load the hla_nom_p.txt
+    ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
+    # example: C*;06:06:01:01/06:06:01:02/06:271;06:06P
+    df_P = pd.read_csv(
+        ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
+    ).dropna()
+
+    # the P-group is named for its first allele
+    df_P["P"] = df_P["A"].apply(get_P_name)
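+    # e.g. first allele 06:06:01:01 gives the P-group name 06:06P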
+
+    # convert slash delimited string to a list
+    df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
+    df_P = df_P.explode("A")
+    # C* 06:06:01:01/06:06:01:02/06:271 06:06P
+    df_P["A"] = df_P["Locus"] + df_P["A"]
+    df_P["P"] = df_P["Locus"] + df_P["P"]
+    # C* 06:06:01:01 06:06P
+    # C* 06:06:01:02 06:06P
+    # C* 06:271 06:06P
+    p_group = df_P.set_index("A")["P"].to_dict()
+    df_P["2d"] = df_P["A"].apply(get_2field_allele)
+    # lgx is the P-group name without the trailing P, for comparison with the G-groups
+    df_P["lgx"] = df_P["P"].apply(get_2field_allele)
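+    # e.g. C*06:06P -> C*06:06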
+
+    # convert slash delimited string to a list
    df["A"] = df["A"].apply(lambda a: a.split("/"))
+    # convert the list into separate rows for each element
    df = df.explode("A")
+
+    # A* + 02:01 = A*02:01
    df["A"] = df["Locus"] + df["A"]
    df["G"] = df["Locus"] + df["G"]
@@ -126,8 +162,24 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    df["lg"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
    df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]))
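+    # e.g. G-group A*02:01:01G gives lg A*02:01g and lgx A*02:01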

+    # compare df_P["2d"] with df["2d"] to find 2-field alleles in the
+    # P-group that aren't in the G-group
+    PnotinG = set(df_P["2d"]) - set(df["2d"])
+
+    # filter to find these 2-field alleles (2d) in the P-group data frame
+    df_PnotG = df_P[df_P["2d"].isin(PnotinG)]
+
+    # dictionary that will define the p_not_g table
+    p_not_g = df_PnotG.set_index("A")["lgx"].to_dict()
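+    # keys are full allele names, values are the 2-field lgx name of their P-group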
+
    # multiple Gs
+    # goal: identify 2-field alleles that are in multiple G-groups
+
+    # drop duplicate (2d, G) pairs, then count how many G-groups each 2d maps to
    mg = df.drop_duplicates(["2d", "G"])["2d"].value_counts()
+    # keep only the entries with count > 1, i.e. 2-field alleles in more than one G-group
+    # reset_index() turns the counted 2d values into a column named "index";
+    # turn that column into a list
    multiple_g_list = mg[mg > 1].reset_index()["index"].to_list()

    # Keep only the alleles that have more than 1 mapping
@@ -202,18 +254,13 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    )
    exon_group = df_exon.set_index("A")["exon"].to_dict()

-    # P groups
-    ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
-    df_P = pd.read_csv(
-        ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
-    ).dropna()
-    df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
-    df_P = df_P.explode("A")
-    df_P["A"] = df_P["Locus"] + df_P["A"]
-    df_P["P"] = df_P["Locus"] + df_P["P"]
-    p_group = df_P.set_index("A")["P"].to_dict()
-
    # save
+    db.save_dict(
+        db_connection,
+        table_name="p_not_g",
+        dictionary=p_not_g,
+        columns=("allele", "lgx"),
+    )
    db.save_dict(
        db_connection,
        table_name="dup_g",
@@ -256,7 +303,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    db.save_dict(
        db_connection,
        table_name="p_group",
-        dictionary=exon_group,
+        dictionary=p_group,
        columns=("allele", "p"),
    )
@@ -269,6 +316,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
        lgx_group=lgx_group,
        exon_group=exon_group,
        p_group=p_group,
+        p_not_g=p_not_g,
    )