Skip to content

Commit 0952a16

Browse files
authored
PnotG (#178)
* works but not for G mode * ping mode * updated test to fix its behavior * remove lambdas, add and remove comments * fixed test expected results to new correct answer * ping tests * lint
1 parent 47e2529 commit 0952a16

File tree

7 files changed

+138
-19
lines changed

7 files changed

+138
-19
lines changed

pyard/data_repository.py

Lines changed: 62 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,13 @@
2323
from collections import namedtuple
2424
import functools
2525
import sqlite3
26-
2726
import pandas as pd
2827

2928
from . import db
3029
from .broad_splits import broad_splits_dna_mapping
3130
from .broad_splits import broad_splits_ser_mapping
3231
from .misc import get_2field_allele, get_3field_allele, number_of_fields
33-
from .misc import expression_chars
32+
from .misc import expression_chars, get_G_name, get_P_name
3433

3534
# GitHub URL where IMGT HLA files are downloaded.
3635
from pyard.smart_sort import smart_sort_comparator
@@ -46,6 +45,7 @@
4645
"lgx_group",
4746
"exon_group",
4847
"p_group",
48+
"p_not_g",
4949
]
5050
ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)
5151

@@ -102,6 +102,9 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
102102
p_group = db.load_dict(
103103
db_connection, table_name="p_group", columns=("allele", "p")
104104
)
105+
p_not_g = db.load_dict(
106+
db_connection, table_name="p_not_g", columns=("allele", "lgx")
107+
)
105108
return ARSMapping(
106109
dup_g=dup_g,
107110
dup_lg=dup_lg,
@@ -111,13 +114,46 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
111114
lgx_group=lgx_group,
112115
exon_group=exon_group,
113116
p_group=p_group,
117+
p_not_g=p_not_g,
114118
)
115119

120+
# load the hla_nom_g.txt
116121
ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt"
117122
df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()
118123

124+
# the G-group is named for its first allele
125+
df["G"] = df["A"].apply(get_G_name)
126+
127+
# load the hla_nom_p.txt
128+
ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
129+
# example: C*;06:06:01:01/06:06:01:02/06:271;06:06P
130+
df_P = pd.read_csv(
131+
ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
132+
).dropna()
133+
134+
# the P-group is named for its first allele
135+
df_P["P"] = df_P["A"].apply(get_P_name)
136+
137+
# convert slash delimited string to a list
138+
df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
139+
df_P = df_P.explode("A")
140+
# C* 06:06:01:01/06:06:01:02/06:271 06:06P
141+
df_P["A"] = df_P["Locus"] + df_P["A"]
142+
df_P["P"] = df_P["Locus"] + df_P["P"]
143+
# C* 06:06:01:01 06:06P
144+
# C* 06:06:01:02 06:06P
145+
# C* 06:271 06:06P
146+
p_group = df_P.set_index("A")["P"].to_dict()
147+
df_P["2d"] = df_P["A"].apply(get_2field_allele)
148+
# lgx has the P-group name without the P for comparison
149+
df_P["lgx"] = df_P["P"].apply(get_2field_allele)
150+
151+
# convert slash delimited string to a list
119152
df["A"] = df["A"].apply(lambda a: a.split("/"))
153+
# convert the list into separate rows for each element
120154
df = df.explode("A")
155+
156+
# A* + 02:01 = A*02:01
121157
df["A"] = df["Locus"] + df["A"]
122158
df["G"] = df["Locus"] + df["G"]
123159

@@ -126,8 +162,24 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
126162
df["lg"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
127163
df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]))
128164

165+
# compare df_P["2d"] with df["2d"] to find 2-field alleles in the
166+
# P-group that aren't in the G-group
167+
PnotinG = set(df_P["2d"]) - set(df["2d"])
168+
169+
# filter to find these 2-field alleles (2d) in the P-group data frame
170+
df_PnotG = df_P[df_P["2d"].isin(PnotinG)]
171+
172+
# dictionary which will define the table
173+
p_not_g = df_PnotG.set_index("A")["lgx"].to_dict()
174+
129175
# multiple Gs
176+
# goal: identify 2-field alleles that are in multiple G-groups
177+
178+
# drop duplicate (2d, G) pairs, then count how many distinct G-groups each 2d allele appears in
130179
mg = df.drop_duplicates(["2d", "G"])["2d"].value_counts()
180+
# keep only the entries of mg with count > 1, i.e. the 2d alleles found in more than one G-group
181+
# take the index (the 2d alleles) of the filtered counts, make that a column
182+
# and turn that into a list
131183
multiple_g_list = mg[mg > 1].reset_index()["index"].to_list()
132184

133185
# Keep only the alleles that have more than 1 mapping
@@ -202,18 +254,13 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
202254
)
203255
exon_group = df_exon.set_index("A")["exon"].to_dict()
204256

205-
# P groups
206-
ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
207-
df_P = pd.read_csv(
208-
ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
209-
).dropna()
210-
df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
211-
df_P = df_P.explode("A")
212-
df_P["A"] = df_P["Locus"] + df_P["A"]
213-
df_P["P"] = df_P["Locus"] + df_P["P"]
214-
p_group = df_P.set_index("A")["P"].to_dict()
215-
216257
# save
258+
db.save_dict(
259+
db_connection,
260+
table_name="p_not_g",
261+
dictionary=p_not_g,
262+
columns=("allele", "lgx"),
263+
)
217264
db.save_dict(
218265
db_connection,
219266
table_name="dup_g",
@@ -256,7 +303,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
256303
db.save_dict(
257304
db_connection,
258305
table_name="p_group",
259-
dictionary=exon_group,
306+
dictionary=p_group,
260307
columns=("allele", "p"),
261308
)
262309

@@ -269,6 +316,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
269316
lgx_group=lgx_group,
270317
exon_group=exon_group,
271318
p_group=p_group,
319+
p_not_g=p_not_g,
272320
)
273321

274322

pyard/misc.py

Lines changed: 34 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# List of expression characters
22
expression_chars = ["N", "Q", "L", "S"]
3+
# List of P and G characters
4+
PandG_chars = ["P", "G"]
35

46

57
def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str:
@@ -20,12 +22,44 @@ def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str:
2022

2123

2224
def get_3field_allele(a: str) -> str:
    """Return the 3-field reduction of ``a``, ignoring a trailing P/G.

    When the last character of ``a`` is listed in ``PandG_chars`` it is
    removed; the resulting name is then passed to
    ``get_n_field_allele(..., 3)`` and that result is returned unchanged.

    :param a: allele or group name; must be non-empty because the last
        character is inspected by index
    :return: the value produced by ``get_n_field_allele`` for 3 fields
    """
    # NOTE(review): the trailing character is dropped whenever it is P or G,
    # regardless of what precedes it -- confirm callers only pass names where
    # that character is a group suffix.
    stripped = a[:-1] if a[-1] in PandG_chars else a
    return get_n_field_allele(stripped, 3)
2430

2531

2632
def get_2field_allele(a: str) -> str:
    """Return the 2-field reduction of ``a``, ignoring a trailing P/G.

    When the last character of ``a`` is listed in ``PandG_chars`` it is
    removed; the resulting name is then passed to
    ``get_n_field_allele(..., 2)`` and that result is returned unchanged.

    :param a: allele or group name; must be non-empty because the last
        character is inspected by index
    :return: the value produced by ``get_n_field_allele`` for 2 fields
    """
    # NOTE(review): the trailing character is dropped whenever it is P or G,
    # regardless of what precedes it -- confirm callers only pass names where
    # that character is a group suffix.
    stripped = a[:-1] if a[-1] in PandG_chars else a
    return get_n_field_allele(stripped, 2)
2837

2938

3039
def number_of_fields(allele: str) -> int:
    """Return how many ``:``-separated fields ``allele`` contains.

    A name without any ``:`` (including the empty string) counts as one
    field, matching ``len(allele.split(":"))``.
    """
    # Splitting on ":" always yields one more piece than there are colons.
    return allele.count(":") + 1
41+
42+
43+
# computes a valid G name based on the ambiguity string
44+
def get_G_name(a: str) -> str:
    """Derive a G-group name from a ``/``-delimited allele string.

    Only the first entry of ``a`` is used. A trailing character found in
    ``PandG_chars`` or ``expression_chars`` is removed, the name is brought
    to three fields, and ``G`` is appended:

    * exactly two fields -> a third field ``01`` is added
      (``"02:10"`` -> ``"02:10:01G"``)
    * otherwise -> at most the first three fields are kept
      (``"01:01:01:01/01:01:01:02"`` -> ``"01:01:01G"``)

    :param a: slash-delimited allele string; its first entry must be
        non-empty because the last character is inspected by index
    :return: the computed G-group name
    """
    first = a.split("/")[0]
    if first[-1] in PandG_chars + expression_chars:
        first = first[:-1]

    fields = first.split(":")
    if len(fields) == 2:
        # Pad a 2-field name so the result always carries a third field.
        fields.append("01")
    return ":".join(fields[:3]) + "G"
53+
54+
55+
# computes a valid P name based on the ambiguity string
56+
def get_P_name(a: str) -> str:
    """Derive a P-group name from a ``/``-delimited allele string.

    Only the first entry of ``a`` is used. A trailing character found in
    ``PandG_chars`` or ``expression_chars`` is removed, at most the first
    two ``:``-separated fields are kept, and ``P`` is appended, e.g.
    ``"06:06:01:01/06:06:01:02/06:271"`` -> ``"06:06P"``.

    :param a: slash-delimited allele string; its first entry must be
        non-empty because the last character is inspected by index
    :return: the computed P-group name
    """
    first, _, _ = a.partition("/")
    suffix_chars = PandG_chars + expression_chars
    if first[-1] in suffix_chars:
        first = first[:-1]
    leading_fields = first.split(":")[:2]
    return ":".join(leading_fields) + "P"
62+
63+
64+
def number_of_fields(allele: str) -> int:
    """Return how many ``:``-separated fields ``allele`` contains."""
    # NOTE(review): an identical number_of_fields is already defined earlier
    # in this module; this later definition rebinds the same name. One of the
    # two can be dropped -- confirm and remove.
    fields = allele.split(":")
    return len(fields)

pyard/pyard.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
"reduce_XX": True,
4949
"reduce_MAC": True,
5050
"reduce_shortnull": True,
51+
"ping": False,
5152
"map_drb345_to_drbx": True,
5253
"verbose_log": True,
5354
}
@@ -140,7 +141,7 @@ def __del__(self):
140141
self.db_connection.close()
141142

142143
@functools.lru_cache(maxsize=max_cache_size)
143-
def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str:
144+
def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES, reping=True) -> str:
144145
"""
145146
Does ARS reduction with allele and ARS type
146147
@@ -172,6 +173,13 @@ def redux(self, allele: str, redux_type: VALID_REDUCTION_TYPES) -> str:
172173
if allele.endswith(("P", "G")):
173174
if redux_type in ["lg", "lgx", "G"]:
174175
allele = allele[:-1]
176+
if self._config["ping"] and reping:
177+
if redux_type in ("lg", "lgx", "U2"):
178+
if allele in self.ars_mappings.p_not_g:
179+
return self.ars_mappings.p_not_g[allele]
180+
else:
181+
return self.redux(allele, redux_type, False)
182+
175183
if redux_type == "G" and allele in self.ars_mappings.g_group:
176184
if allele in self.ars_mappings.dup_g:
177185
return self.ars_mappings.dup_g[allele]

tests/environment.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,9 @@
33

44
def before_all(context):
    """Create the ARD instances the test steps read from ``context``.

    ``context.ard`` is built without a config argument; ``context.ard_ping``
    is built with the ``ping`` option set to True. Both use version "3440"
    and the same data directory.
    """
    imgt_version = "3440"
    data_dir = "/tmp/py-ard"

    context.ard = ARD(imgt_version, data_dir=data_dir)

    # an ard with ping set to True
    context.ard_ping = ARD(imgt_version, data_dir=data_dir, config={"ping": True})

tests/features/allele.feature

Lines changed: 20 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,20 @@
11
Feature: Alleles
22

3-
Scenario Outline:
3+
Scenario Outline: allele reduction with ping
4+
5+
Given the allele as <Allele>
6+
When reducing on the <Level> level with ping
7+
Then the reduced allele is found to be <Redux Allele>
8+
9+
Examples:
10+
| Allele | Level | Redux Allele |
11+
| C*02:02 | lg | C*02:02g |
12+
| C*02:02 | lgx | C*02:02 |
13+
| C*02:10 | lg | C*02:02g |
14+
| C*02:10 | lgx | C*02:02 |
15+
| C*06:17 | lgx | C*06:02 |
16+
17+
Scenario Outline: allele reduction
418

519
Given the allele as <Allele>
620
When reducing on the <Level> level
@@ -21,5 +35,8 @@ Feature: Alleles
2135

2236
| DRB1*14:06:01 | lgx | DRB1*14:06 |
2337
| DRB1*14:06:01 | lg | DRB1*14:06g |
24-
| C*02:02 | lg | C*02:02g/C*02:10g |
25-
| C*02:02 | lgx | C*02:02/C*02:10 |
38+
| C*02:02 | lg | C*02:02g |
39+
| C*02:02 | lgx | C*02:02 |
40+
| C*02:10 | lg | C*02:02g |
41+
| C*02:10 | lgx | C*02:02 |
42+
| C*06:17 | lgx | C*06:17 |

tests/steps/redux_allele.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,12 @@ def step_impl(context, level):
1515
context.redux_allele = context.ard.redux(context.allele, level)
1616

1717

18+
@when("reducing on the {level} level with ping")
def step_impl(context, level):
    """Reduce ``context.allele`` at ``level`` using the ping-enabled ARD.

    Records the requested level on the context, then stores the result of
    ``context.ard_ping.redux`` in ``context.redux_allele``.
    """
    context.level = level
    ping_ard = context.ard_ping
    reduced = ping_ard.redux(context.allele, level)
    context.redux_allele = reduced
22+
23+
1824
@when("reducing on the {level} level (ambiguous)")
1925
def step_impl(context, level):
2026
context.level = level

tests/test_pyard.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -156,4 +156,4 @@ def test_allele_duplicated(self):
156156
# https://github.com/nmdp-bioinformatics/py-ard/issues/135
157157
allele_code = "C*02:ACMGS"
158158
allele_code_rx = self.ard.redux_gl(allele_code, "lgx")
159-
self.assertEqual(allele_code_rx, "C*02:02/C*02:10")
159+
self.assertEqual(allele_code_rx, "C*02:02")

0 commit comments

Comments
 (0)