28 | 28 | from . import db |
29 | 29 | from .broad_splits import broad_splits_dna_mapping |
30 | 30 | from .broad_splits import broad_splits_ser_mapping |
31 | | -from .misc import get_2field_allele, get_3field_allele, number_of_fields |
| 31 | +from .misc import ( |
| 32 | + get_2field_allele, |
| 33 | + get_3field_allele, |
| 34 | + number_of_fields, |
| 35 | + get_1field_allele, |
| 36 | +) |
32 | 37 | from .misc import expression_chars, get_G_name, get_P_name |
33 | 38 |
34 | 39 | # GitHub URL where IMGT HLA files are downloaded. |
42 | 47 | "g_group", |
43 | 48 | "lgx_group", |
44 | 49 | "exon_group", |
45 | | - "p_group", |
46 | 50 | "p_not_g", |
47 | 51 | ] |
48 | 52 | ARSMapping = namedtuple("ARSMapping", ars_mapping_tables) |
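Taken with the return-value changes below, dropping "p_group" here means the ARSMapping namedtuple no longer carries a p_group field; generate_ars_mapping instead hands the raw P-group dict back alongside the mapping. A minimal sketch of the resulting call-site contract (the unpacking itself is an assumption, not shown in this diff):

    # Assumed call site: generate_ars_mapping now returns a 2-tuple.
    ars_mappings, p_group = generate_ars_mapping(db_connection, imgt_version)
    # p_group is a dict of allele -> P-group name on a fresh build, or None
    # when the tables already existed in the SQLite cache (early return below).
    generate_alleles_and_xx_codes_and_who(
        db_connection, imgt_version, ars_mappings, p_group
    )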
@@ -91,75 +95,35 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): |
91 | 95 | exon_group = db.load_dict( |
92 | 96 | db_connection, table_name="exon_group", columns=("allele", "exon") |
93 | 97 | ) |
94 | | - p_group = db.load_dict( |
95 | | - db_connection, table_name="p_group", columns=("allele", "p") |
96 | | - ) |
97 | 98 | p_not_g = db.load_dict( |
98 | 99 | db_connection, table_name="p_not_g", columns=("allele", "lgx") |
99 | 100 | ) |
100 | | - return ARSMapping( |
101 | | - dup_g=dup_g, |
102 | | - dup_lgx=dup_lgx, |
103 | | - g_group=g_group, |
104 | | - lgx_group=lgx_group, |
105 | | - exon_group=exon_group, |
106 | | - p_group=p_group, |
107 | | - p_not_g=p_not_g, |
| 101 | + return ( |
| 102 | + ARSMapping( |
| 103 | + dup_g=dup_g, |
| 104 | + dup_lgx=dup_lgx, |
| 105 | + g_group=g_group, |
| 106 | + lgx_group=lgx_group, |
| 107 | + exon_group=exon_group, |
| 108 | + p_not_g=p_not_g, |
| 109 | + ), |
| 110 | + None, |
108 | 111 | ) |
109 | 112 |
110 | | - # load the hla_nom_g.txt |
111 | | - ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt" |
112 | | - df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() |
113 | | - |
114 | | - # the G-group is named for its first allele |
115 | | - df["G"] = df["A"].apply(get_G_name) |
116 | | - |
117 | | - # load the hla_nom_p.txt |
118 | | - ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt" |
119 | | - # example: C*;06:06:01:01/06:06:01:02/06:271;06:06P |
120 | | - df_P = pd.read_csv( |
121 | | - ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";" |
122 | | - ).dropna() |
| 113 | + df = load_g_group(imgt_version) |
123 | 114 |
124 | | - # the P-group is named for its first allele |
125 | | - df_P["P"] = df_P["A"].apply(get_P_name) |
| 115 | + df_p_group = load_p_group(imgt_version) |
| 116 | + p_group = df_p_group.set_index("A")["P"].to_dict() |
126 | 117 |
127 | | - # convert slash delimited string to a list |
128 | | - df_P["A"] = df_P["A"].apply(lambda a: a.split("/")) |
129 | | - df_P = df_P.explode("A") |
130 | | - # C* 06:06:01:01/06:06:01:02/06:271 06:06P |
131 | | - df_P["A"] = df_P["Locus"] + df_P["A"] |
132 | | - df_P["P"] = df_P["Locus"] + df_P["P"] |
133 | | - # C* 06:06:01:01 06:06P |
134 | | - # C* 06:06:01:02 06:06P |
135 | | - # C* 06:271 06:06P |
136 | | - p_group = df_P.set_index("A")["P"].to_dict() |
137 | | - df_P["2d"] = df_P["A"].apply(get_2field_allele) |
138 | | - # lgx has the P-group name without the P for comparison |
139 | | - df_P["lgx"] = df_P["P"].apply(get_2field_allele) |
140 | | - |
141 | | - # convert slash delimited string to a list |
142 | | - df["A"] = df["A"].apply(lambda a: a.split("/")) |
143 | | - # convert the list into separate rows for each element |
144 | | - df = df.explode("A") |
145 | | - |
146 | | - # A* + 02:01 = A*02:01 |
147 | | - df["A"] = df["Locus"] + df["A"] |
148 | | - df["G"] = df["Locus"] + df["G"] |
149 | | - |
150 | | - df["2d"] = df["A"].apply(get_2field_allele) |
151 | | - df["3d"] = df["A"].apply(get_3field_allele) |
152 | | - df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2])) |
153 | | - |
154 | | - # compare df_P["2d"] with df["2d"] to find 2-field alleles in the |
| 118 | + # compare df_p_group["2d"] with df["2d"] to find 2-field alleles in the |
155 | 119 | # P-group that aren't in the G-group |
156 | | - PnotinG = set(df_P["2d"]) - set(df["2d"]) |
| 120 | + p_not_in_g = set(df_p_group["2d"]) - set(df["2d"]) |
157 | 121 |
158 | 122 | # filter to find these 2-field alleles (2d) in the P-group data frame |
159 | | - df_PnotG = df_P[df_P["2d"].isin(PnotinG)] |
| 123 | + df_p_not_g = df_p_group[df_p_group["2d"].isin(p_not_in_g)] |
160 | 124 |
161 | 125 | # dictionary which will define the table |
162 | | - p_not_g = df_PnotG.set_index("A")["lgx"].to_dict() |
| 126 | + p_not_g = df_p_not_g.set_index("A")["lgx"].to_dict() |
163 | 127 |
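A toy illustration of the p_not_g construction above, with made-up frames (C*06:271 and its P-group C*06:06P are the example alleles from the original comments):

    import pandas as pd

    # One P-group row whose 2-field form never occurs in any G-group
    df_p_group = pd.DataFrame(
        {"A": ["C*06:271"], "2d": ["C*06:271"], "lgx": ["C*06:06"]}
    )
    # 2-field alleles that do appear in G-groups
    df = pd.DataFrame({"2d": ["C*06:06", "C*06:08"]})
    p_not_in_g = set(df_p_group["2d"]) - set(df["2d"])  # {"C*06:271"}
    df_p_not_g = df_p_group[df_p_group["2d"].isin(p_not_in_g)]
    p_not_g = df_p_not_g.set_index("A")["lgx"].to_dict()
    # {"C*06:271": "C*06:06"}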
164 | 128 | # multiple Gs |
165 | 129 | # goal: identify 2-field alleles that are in multiple G-groups |
@@ -255,26 +219,68 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): |
255 | 219 | dictionary=exon_group, |
256 | 220 | columns=("allele", "exon"), |
257 | 221 | ) |
258 | | - db.save_dict( |
259 | | - db_connection, |
260 | | - table_name="p_group", |
261 | | - dictionary=p_group, |
262 | | - columns=("allele", "p"), |
263 | | - ) |
264 | 222 |
265 | | - return ARSMapping( |
266 | | - dup_g=dup_g, |
267 | | - dup_lgx=dup_lgx, |
268 | | - g_group=g_group, |
269 | | - lgx_group=lgx_group, |
270 | | - exon_group=exon_group, |
271 | | - p_group=p_group, |
272 | | - p_not_g=p_not_g, |
| 223 | + return ( |
| 224 | + ARSMapping( |
| 225 | + dup_g=dup_g, |
| 226 | + dup_lgx=dup_lgx, |
| 227 | + g_group=g_group, |
| 228 | + lgx_group=lgx_group, |
| 229 | + exon_group=exon_group, |
| 230 | + p_not_g=p_not_g, |
| 231 | + ), |
| 232 | + p_group, |
273 | 233 | ) |
274 | 234 |
275 | 235 |
| 236 | +def load_g_group(imgt_version): |
| 237 | + # load the hla_nom_g.txt |
| 238 | + ars_g_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt" |
| 239 | + df = pd.read_csv(ars_g_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna() |
| 240 | + # the G-group is named for its first allele |
| 241 | + df["G"] = df["A"].apply(get_G_name) |
| 242 | + # convert slash delimited string to a list |
| 243 | + df["A"] = df["A"].apply(lambda a: a.split("/")) |
| 244 | + # convert the list into separate rows for each element |
| 245 | + df = df.explode("A") |
| 246 | + # A* + 02:01 = A*02:01 |
| 247 | + df["A"] = df["Locus"] + df["A"] |
| 248 | + df["G"] = df["Locus"] + df["G"] |
| 249 | + # Create 2,3 field versions of the alleles |
| 250 | + df["2d"] = df["A"].apply(get_2field_allele) |
| 251 | + df["3d"] = df["A"].apply(get_3field_allele) |
| 252 | + # lgx is 2 field version of the G group allele |
| 253 | + df["lgx"] = df["G"].apply(get_2field_allele) |
| 254 | + |
| 255 | + return df |
| 256 | + |
| 257 | + |
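A worked example of the reshaping load_g_group performs, on one hypothetical hla_nom_g.txt row (the allele names are illustrative only):

    import pandas as pd

    # Hypothetical input row: A*;01:01:01:01/01:02;01:01:01G
    df = pd.DataFrame([{"Locus": "A*", "A": "01:01:01:01/01:02", "G": "01:01:01G"}])
    df["A"] = df["A"].apply(lambda a: a.split("/"))
    df = df.explode("A")  # one row per allele in the slash-delimited list
    df["A"] = df["Locus"] + df["A"]
    df["G"] = df["Locus"] + df["G"]
    # Two rows remain: A*01:01:01:01 and A*01:02, both in G-group A*01:01:01G;
    # the 2d/3d/lgx columns are then cut down from these full names.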
| 258 | +def load_p_group(imgt_version): |
| 259 | + # load the hla_nom_p.txt |
| 260 | + ars_p_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt" |
| 261 | + # example: C*;06:06:01:01/06:06:01:02/06:271;06:06P |
| 262 | + df_p = pd.read_csv( |
| 263 | + ars_p_url, skiprows=6, names=["Locus", "A", "P"], sep=";" |
| 264 | + ).dropna() |
| 265 | + # the P-group is named for its first allele |
| 266 | + df_p["P"] = df_p["A"].apply(get_P_name) |
| 267 | + # convert slash delimited string to a list |
| 268 | + df_p["A"] = df_p["A"].apply(lambda a: a.split("/")) |
| 269 | + df_p = df_p.explode("A") |
| 270 | + # C* 06:06:01:01/06:06:01:02/06:271 06:06P |
| 271 | + df_p["A"] = df_p["Locus"] + df_p["A"] |
| 272 | + df_p["P"] = df_p["Locus"] + df_p["P"] |
| 273 | + # C* 06:06:01:01 06:06P |
| 274 | + # C* 06:06:01:02 06:06P |
| 275 | + # C* 06:271 06:06P |
| 276 | + df_p["2d"] = df_p["A"].apply(get_2field_allele) |
| 277 | + # lgx has the P-group name without the P for comparison |
| 278 | + df_p["lgx"] = df_p["P"].apply(get_2field_allele) |
| 279 | + return df_p |
| 280 | + |
| 281 | + |
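With load_p_group in place, the p_group dict that used to be built inline (and persisted to the now-removed p_group table) is reconstructed at the call site, as the hunk above shows:

    # Arbitrary example version; any published IMGT/HLA release works here.
    df_p_group = load_p_group("3520")
    p_group = df_p_group.set_index("A")["P"].to_dict()
    # Per the example row documented above:
    #   p_group["C*06:06:01:01"] == "C*06:06P"
    #   p_group["C*06:271"] == "C*06:06P"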
276 | 282 | def generate_alleles_and_xx_codes_and_who( |
277 | | - db_connection: sqlite3.Connection, imgt_version, ars_mappings |
| 283 | + db_connection: sqlite3.Connection, imgt_version, ars_mappings, p_group |
278 | 284 | ): |
279 | 285 | """ |
280 | 286 | Checks to see if there's already an allele list file for the `imgt_version` |
@@ -386,28 +392,23 @@ def generate_alleles_and_xx_codes_and_who( |
386 | 392 | db.save_dict(db_connection, "xx_codes", flat_xx_codes, ("allele_1d", "allele_list")) |
387 | 393 |
388 | 394 | # W H O |
389 | | - who_alleles = set(allele_df["Allele"]) |
| 395 | + who_alleles = allele_df["Allele"].to_list() |
390 | 396 | # Save this version of the WHO alleles |
391 | 397 | db.save_set(db_connection, "who_alleles", who_alleles, "allele") |
| 398 | + |
392 | 399 | # Create WHO mapping from the unique alleles in the 1-field column |
393 | | - unique_alleles = allele_df["Allele"].unique() |
394 | | - who_df1 = pd.DataFrame(unique_alleles, columns=["Allele"]) |
395 | | - who_df1["nd"] = allele_df["Allele"].apply(lambda x: x.split(":")[0]) |
396 | | - # Create WHO mapping from the unique alleles in the 2-field column |
397 | | - who_df2 = pd.DataFrame(unique_alleles, columns=["Allele"]) |
398 | | - who_df2["nd"] = allele_df["Allele"].apply(get_2field_allele) |
399 | | - # Create WHO mapping from the unique alleles in the 3-field column |
400 | | - who_df3 = pd.DataFrame(unique_alleles, columns=["Allele"]) |
401 | | - who_df3["nd"] = allele_df["Allele"].apply(get_3field_allele) |
402 | | - # Combine n-field dataframes in 1 |
403 | | - |
404 | | - # Create g_codes expansion mapping from the same tables used to reduce to G |
405 | | - g_df = pd.DataFrame(list(ars_mappings.g_group.items()), columns=["Allele", "nd"]) |
406 | | - |
407 | | - # Create p_codes expansion mapping from the p_group table |
408 | | - p_df = pd.DataFrame(list(ars_mappings.p_group.items()), columns=["Allele", "nd"]) |
409 | | - |
410 | | - who_codes = pd.concat([who_df1, who_df2, who_df3, g_df, p_df]) |
| 400 | + allele_df["1d"] = allele_df["Allele"].apply(get_1field_allele) |
| 401 | + |
| 402 | + who_codes = pd.concat( |
| 403 | + [ |
| 404 | + allele_df[["Allele", "1d"]].rename(columns={"1d": "nd"}), |
| 405 | + allele_df[["Allele", "2d"]].rename(columns={"2d": "nd"}), |
| 406 | + allele_df[["Allele", "3d"]].rename(columns={"3d": "nd"}), |
| 407 | + pd.DataFrame(ars_mappings.g_group.items(), columns=["Allele", "nd"]), |
| 408 | + pd.DataFrame(p_group.items(), columns=["Allele", "nd"]), |
| 409 | + ], |
| 410 | + ignore_index=True, |
| 411 | + ) |
411 | 412 |
412 | 413 | # remove valid alleles from who_codes to avoid recursion |
413 | 414 | for k in who_alleles: |
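A sketch of what the reworked who_codes concat builds: every 1-, 2- and 3-field name, plus each G- and P-group name, becomes an "nd" key pointing back at the WHO alleles it expands to. Toy frame with assumed 1d/2d/3d columns:

    import pandas as pd

    allele_df = pd.DataFrame(
        {
            "Allele": ["A*01:01:01:01", "A*01:01:01:02"],
            "1d": ["A*01", "A*01"],
            "2d": ["A*01:01", "A*01:01"],
            "3d": ["A*01:01:01", "A*01:01:01"],
        }
    )
    who_codes = pd.concat(
        [
            allele_df[["Allele", d]].rename(columns={d: "nd"})
            for d in ("1d", "2d", "3d")
        ],
        ignore_index=True,
    )
    # Both 4-field alleles now appear under A*01, A*01:01 and A*01:01:01,
    # and valid WHO alleles are dropped afterwards to avoid recursion.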