Skip to content

Commit d327445

Browse files
authored
Merge pull request #189 from pbashyal-nmdp/remove_p_group_table
Don't save the p_group table
2 parents 2917c0e + c4abd35 commit d327445

File tree

3 files changed

+103
-95
lines changed

3 files changed

+103
-95
lines changed

pyard/data_repository.py

Lines changed: 94 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,12 @@
2828
from . import db
2929
from .broad_splits import broad_splits_dna_mapping
3030
from .broad_splits import broad_splits_ser_mapping
31-
from .misc import get_2field_allele, get_3field_allele, number_of_fields
31+
from .misc import (
32+
get_2field_allele,
33+
get_3field_allele,
34+
number_of_fields,
35+
get_1field_allele,
36+
)
3237
from .misc import expression_chars, get_G_name, get_P_name
3338

3439
# GitHub URL where IMGT HLA files are downloaded.
@@ -42,7 +47,6 @@
4247
"g_group",
4348
"lgx_group",
4449
"exon_group",
45-
"p_group",
4650
"p_not_g",
4751
]
4852
ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)
@@ -91,75 +95,35 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
9195
exon_group = db.load_dict(
9296
db_connection, table_name="exon_group", columns=("allele", "exon")
9397
)
94-
p_group = db.load_dict(
95-
db_connection, table_name="p_group", columns=("allele", "p")
96-
)
9798
p_not_g = db.load_dict(
9899
db_connection, table_name="p_not_g", columns=("allele", "lgx")
99100
)
100-
return ARSMapping(
101-
dup_g=dup_g,
102-
dup_lgx=dup_lgx,
103-
g_group=g_group,
104-
lgx_group=lgx_group,
105-
exon_group=exon_group,
106-
p_group=p_group,
107-
p_not_g=p_not_g,
101+
return (
102+
ARSMapping(
103+
dup_g=dup_g,
104+
dup_lgx=dup_lgx,
105+
g_group=g_group,
106+
lgx_group=lgx_group,
107+
exon_group=exon_group,
108+
p_not_g=p_not_g,
109+
),
110+
None,
108111
)
109112

110-
# load the hla_nom_g.txt
111-
ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt"
112-
df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()
113-
114-
# the G-group is named for its first allele
115-
df["G"] = df["A"].apply(get_G_name)
116-
117-
# load the hla_nom_p.txt
118-
ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
119-
# example: C*;06:06:01:01/06:06:01:02/06:271;06:06P
120-
df_P = pd.read_csv(
121-
ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
122-
).dropna()
113+
df = load_g_group(imgt_version)
123114

124-
# the P-group is named for its first allele
125-
df_P["P"] = df_P["A"].apply(get_P_name)
115+
df_p_group = load_p_group(imgt_version)
116+
p_group = df_p_group.set_index("A")["P"].to_dict()
126117

127-
# convert slash delimited string to a list
128-
df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
129-
df_P = df_P.explode("A")
130-
# C* 06:06:01:01/06:06:01:02/06:271 06:06P
131-
df_P["A"] = df_P["Locus"] + df_P["A"]
132-
df_P["P"] = df_P["Locus"] + df_P["P"]
133-
# C* 06:06:01:01 06:06P
134-
# C* 06:06:01:02 06:06P
135-
# C* 06:271 06:06P
136-
p_group = df_P.set_index("A")["P"].to_dict()
137-
df_P["2d"] = df_P["A"].apply(get_2field_allele)
138-
# lgx has the P-group name without the P for comparison
139-
df_P["lgx"] = df_P["P"].apply(get_2field_allele)
140-
141-
# convert slash delimited string to a list
142-
df["A"] = df["A"].apply(lambda a: a.split("/"))
143-
# convert the list into separate rows for each element
144-
df = df.explode("A")
145-
146-
# A* + 02:01 = A*02:01
147-
df["A"] = df["Locus"] + df["A"]
148-
df["G"] = df["Locus"] + df["G"]
149-
150-
df["2d"] = df["A"].apply(get_2field_allele)
151-
df["3d"] = df["A"].apply(get_3field_allele)
152-
df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]))
153-
154-
# compare df_P["2d"] with df["2d"] to find 2-field alleles in the
118+
# compare df_p_group["2d"] with df["2d"] to find 2-field alleles in the
155119
# P-group that aren't in the G-group
156-
PnotinG = set(df_P["2d"]) - set(df["2d"])
120+
p_not_in_g = set(df_p_group["2d"]) - set(df["2d"])
157121

158122
# filter to find these 2-field alleles (2d) in the P-group data frame
159-
df_PnotG = df_P[df_P["2d"].isin(PnotinG)]
123+
df_p_not_g = df_p_group[df_p_group["2d"].isin(p_not_in_g)]
160124

161125
# dictionary which will define the table
162-
p_not_g = df_PnotG.set_index("A")["lgx"].to_dict()
126+
p_not_g = df_p_not_g.set_index("A")["lgx"].to_dict()
163127

164128
# multiple Gs
165129
# goal: identify 2-field alleles that are in multiple G-groups
@@ -255,26 +219,68 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
255219
dictionary=exon_group,
256220
columns=("allele", "exon"),
257221
)
258-
db.save_dict(
259-
db_connection,
260-
table_name="p_group",
261-
dictionary=p_group,
262-
columns=("allele", "p"),
263-
)
264222

265-
return ARSMapping(
266-
dup_g=dup_g,
267-
dup_lgx=dup_lgx,
268-
g_group=g_group,
269-
lgx_group=lgx_group,
270-
exon_group=exon_group,
271-
p_group=p_group,
272-
p_not_g=p_not_g,
223+
return (
224+
ARSMapping(
225+
dup_g=dup_g,
226+
dup_lgx=dup_lgx,
227+
g_group=g_group,
228+
lgx_group=lgx_group,
229+
exon_group=exon_group,
230+
p_not_g=p_not_g,
231+
),
232+
p_group,
273233
)
274234

275235

236+
def load_g_group(imgt_version):
237+
# load the hla_nom_g.txt
238+
ars_g_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt"
239+
df = pd.read_csv(ars_g_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()
240+
# the G-group is named for its first allele
241+
df["G"] = df["A"].apply(get_G_name)
242+
# convert slash delimited string to a list
243+
df["A"] = df["A"].apply(lambda a: a.split("/"))
244+
# convert the list into separate rows for each element
245+
df = df.explode("A")
246+
# A* + 02:01 = A*02:01
247+
df["A"] = df["Locus"] + df["A"]
248+
df["G"] = df["Locus"] + df["G"]
249+
# Create 2,3 field versions of the alleles
250+
df["2d"] = df["A"].apply(get_2field_allele)
251+
df["3d"] = df["A"].apply(get_3field_allele)
252+
# lgx is 2 field version of the G group allele
253+
df["lgx"] = df["G"].apply(get_2field_allele)
254+
255+
return df
256+
257+
258+
def load_p_group(imgt_version):
259+
# load the hla_nom_p.txt
260+
ars_p_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
261+
# example: C*;06:06:01:01/06:06:01:02/06:271;06:06P
262+
df_p = pd.read_csv(
263+
ars_p_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
264+
).dropna()
265+
# the P-group is named for its first allele
266+
df_p["P"] = df_p["A"].apply(get_P_name)
267+
# convert slash delimited string to a list
268+
df_p["A"] = df_p["A"].apply(lambda a: a.split("/"))
269+
df_p = df_p.explode("A")
270+
# C* 06:06:01:01/06:06:01:02/06:271 06:06P
271+
df_p["A"] = df_p["Locus"] + df_p["A"]
272+
df_p["P"] = df_p["Locus"] + df_p["P"]
273+
# C* 06:06:01:01 06:06P
274+
# C* 06:06:01:02 06:06P
275+
# C* 06:271 06:06P
276+
df_p["2d"] = df_p["A"].apply(get_2field_allele)
277+
# lgx has the P-group name without the P for comparison
278+
df_p["lgx"] = df_p["P"].apply(get_2field_allele)
279+
return df_p
280+
281+
276282
def generate_alleles_and_xx_codes_and_who(
277-
db_connection: sqlite3.Connection, imgt_version, ars_mappings
283+
db_connection: sqlite3.Connection, imgt_version, ars_mappings, p_group
278284
):
279285
"""
280286
Checks to see if there's already an allele list file for the `imgt_version`
@@ -386,28 +392,23 @@ def generate_alleles_and_xx_codes_and_who(
386392
db.save_dict(db_connection, "xx_codes", flat_xx_codes, ("allele_1d", "allele_list"))
387393

388394
# W H O
389-
who_alleles = set(allele_df["Allele"])
395+
who_alleles = allele_df["Allele"].to_list()
390396
# Save this version of the WHO alleles
391397
db.save_set(db_connection, "who_alleles", who_alleles, "allele")
398+
392399
# Create WHO mapping from the unique alleles in the 1-field column
393-
unique_alleles = allele_df["Allele"].unique()
394-
who_df1 = pd.DataFrame(unique_alleles, columns=["Allele"])
395-
who_df1["nd"] = allele_df["Allele"].apply(lambda x: x.split(":")[0])
396-
# Create WHO mapping from the unique alleles in the 2-field column
397-
who_df2 = pd.DataFrame(unique_alleles, columns=["Allele"])
398-
who_df2["nd"] = allele_df["Allele"].apply(get_2field_allele)
399-
# Create WHO mapping from the unique alleles in the 3-field column
400-
who_df3 = pd.DataFrame(unique_alleles, columns=["Allele"])
401-
who_df3["nd"] = allele_df["Allele"].apply(get_3field_allele)
402-
# Combine n-field dataframes in 1
403-
404-
# Create g_codes expansion mapping from the same tables used to reduce to G
405-
g_df = pd.DataFrame(list(ars_mappings.g_group.items()), columns=["Allele", "nd"])
406-
407-
# Create p_codes expansion mapping from the p_group table
408-
p_df = pd.DataFrame(list(ars_mappings.p_group.items()), columns=["Allele", "nd"])
409-
410-
who_codes = pd.concat([who_df1, who_df2, who_df3, g_df, p_df])
400+
allele_df["1d"] = allele_df["Allele"].apply(get_1field_allele)
401+
402+
who_codes = pd.concat(
403+
[
404+
allele_df[["Allele", "1d"]].rename(columns={"1d": "nd"}),
405+
allele_df[["Allele", "2d"]].rename(columns={"2d": "nd"}),
406+
allele_df[["Allele", "3d"]].rename(columns={"3d": "nd"}),
407+
pd.DataFrame(ars_mappings.g_group.items(), columns=["Allele", "nd"]),
408+
pd.DataFrame(p_group.items(), columns=["Allele", "nd"]),
409+
],
410+
ignore_index=True,
411+
)
411412

412413
# remove valid alleles from who_codes to avoid recursion
413414
for k in who_alleles:

pyard/misc.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ def get_n_field_allele(allele: str, n: int, preserve_expression=False) -> str:
1111
1212
:param allele: Original allele
1313
:param n: n number of fields to reduce to
14+
:param preserve_expression: keep the expression character ?
1415
:return: trimmed to n fields of the original allele
1516
"""
1617
last_char = allele[-1]
@@ -36,6 +37,10 @@ def get_2field_allele(a: str) -> str:
3637
return get_n_field_allele(a, 2)
3738

3839

40+
def get_1field_allele(a: str) -> str:
41+
return get_n_field_allele(a, 1)
42+
43+
3944
def number_of_fields(allele: str) -> int:
4045
return len(allele.split(":"))
4146

pyard/pyard.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,7 +102,9 @@ def __init__(
102102
# Load MAC codes
103103
dr.generate_mac_codes(self.db_connection, False)
104104
# Load ARS mappings
105-
self.ars_mappings = dr.generate_ars_mapping(self.db_connection, imgt_version)
105+
self.ars_mappings, p_group = dr.generate_ars_mapping(
106+
self.db_connection, imgt_version
107+
)
106108
# Load Alleles and XX Codes
107109
(
108110
self.valid_alleles,
@@ -111,7 +113,7 @@ def __init__(
111113
self.who_group,
112114
self.exp_alleles,
113115
) = dr.generate_alleles_and_xx_codes_and_who(
114-
self.db_connection, imgt_version, self.ars_mappings
116+
self.db_connection, imgt_version, self.ars_mappings, p_group
115117
)
116118

117119
# Generate short nulls from WHO mapping

0 commit comments

Comments (0)