from collections import namedtuple
import functools
import sqlite3
-
import pandas as pd

from . import db
from .broad_splits import broad_splits_dna_mapping
from .broad_splits import broad_splits_ser_mapping
from .misc import get_2field_allele, get_3field_allele, number_of_fields
-from .misc import expression_chars
+from .misc import expression_chars, get_G_name, get_P_name

# GitHub URL where IMGT HLA files are downloaded.
from pyard.smart_sort import smart_sort_comparator
    "lgx_group",
    "exon_group",
    "p_group",
+    "p_not_g",
]
ARSMapping = namedtuple("ARSMapping", ars_mapping_tables)
@@ -102,6 +102,9 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    p_group = db.load_dict(
        db_connection, table_name="p_group", columns=("allele", "p")
    )
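+    # p_not_g: alleles whose P-group has no corresponding G-group, mapped to their lgx form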
+    p_not_g = db.load_dict(
+        db_connection, table_name="p_not_g", columns=("allele", "lgx")
+    )
    return ARSMapping(
        dup_g=dup_g,
        dup_lg=dup_lg,
@@ -111,13 +114,46 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
        lgx_group=lgx_group,
        exon_group=exon_group,
        p_group=p_group,
+        p_not_g=p_not_g,
    )

+    # load the hla_nom_g.txt
    ars_G_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt"
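+    # each row: Locus;allele1/allele2/...;G-group name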
    df = pd.read_csv(ars_G_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

+    # the G-group is named for its first allele
+    df["G"] = df["A"].apply(get_G_name)
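+    # e.g. a group whose first allele is 02:01:01:01 is named 02:01:01G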
+
+    # load the hla_nom_p.txt
+    ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
+    # example: C*;06:06:01:01/06:06:01:02/06:271;06:06P
+    df_P = pd.read_csv(
+        ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
+    ).dropna()
+
+    # the P-group is named for its first allele
+    df_P["P"] = df_P["A"].apply(get_P_name)
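+    # e.g. first allele 06:06:01:01 gives the P-group name 06:06P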
+
+    # convert slash delimited string to a list
+    df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
+    df_P = df_P.explode("A")
+    # C* 06:06:01:01/06:06:01:02/06:271 06:06P
+    df_P["A"] = df_P["Locus"] + df_P["A"]
+    df_P["P"] = df_P["Locus"] + df_P["P"]
+    # C* 06:06:01:01 06:06P
+    # C* 06:06:01:02 06:06P
+    # C* 06:271 06:06P
+    p_group = df_P.set_index("A")["P"].to_dict()
+    df_P["2d"] = df_P["A"].apply(get_2field_allele)
+    # lgx is the P-group name without the trailing P, for comparison with the G-groups
+    df_P["lgx"] = df_P["P"].apply(get_2field_allele)
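+    # e.g. C*06:06P -> C*06:06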
+
+    # convert slash delimited string to a list
    df["A"] = df["A"].apply(lambda a: a.split("/"))
+    # convert the list into separate rows for each element
    df = df.explode("A")
+
+    # A* + 02:01 = A*02:01
    df["A"] = df["Locus"] + df["A"]
    df["G"] = df["Locus"] + df["G"]
@@ -126,8 +162,24 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    df["lg"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
    df["lgx"] = df["G"].apply(lambda a: ":".join(a.split(":")[0:2]))
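+    # e.g. G-group A*02:01:01G gives lg A*02:01g and lgx A*02:01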

+    # compare df_P["2d"] with df["2d"] to find 2-field alleles in the
+    # P-group that aren't in the G-group
+    PnotinG = set(df_P["2d"]) - set(df["2d"])
+
+    # filter to find these 2-field alleles (2d) in the P-group data frame
+    df_PnotG = df_P[df_P["2d"].isin(PnotinG)]
+
+    # dictionary that will define the p_not_g table
+    p_not_g = df_PnotG.set_index("A")["lgx"].to_dict()
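+    # keys are full allele names, values are the 2-field lgx name of their P-group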
+
    # multiple Gs
+    # goal: identify 2-field alleles that are in multiple G-groups
+
+    # drop duplicate (2d, G) pairs, then count how many G-groups each 2d maps to
    mg = df.drop_duplicates(["2d", "G"])["2d"].value_counts()
+    # keep only the entries with count > 1, i.e. 2-field alleles in more than one G-group
+    # reset_index() turns the counted 2d values into a column named "index";
+    # turn that column into a list
    multiple_g_list = mg[mg > 1].reset_index()["index"].to_list()

    # Keep only the alleles that have more than 1 mapping
@@ -202,18 +254,13 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    )
    exon_group = df_exon.set_index("A")["exon"].to_dict()

-    # P groups
-    ars_P_url = f"{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_p.txt"
-    df_P = pd.read_csv(
-        ars_P_url, skiprows=6, names=["Locus", "A", "P"], sep=";"
-    ).dropna()
-    df_P["A"] = df_P["A"].apply(lambda a: a.split("/"))
-    df_P = df_P.explode("A")
-    df_P["A"] = df_P["Locus"] + df_P["A"]
-    df_P["P"] = df_P["Locus"] + df_P["P"]
-    p_group = df_P.set_index("A")["P"].to_dict()
-
    # save
+    db.save_dict(
+        db_connection,
+        table_name="p_not_g",
+        dictionary=p_not_g,
+        columns=("allele", "lgx"),
+    )
    db.save_dict(
        db_connection,
        table_name="dup_g",
@@ -256,7 +303,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    db.save_dict(
        db_connection,
        table_name="p_group",
-        dictionary=exon_group,
+        dictionary=p_group,
        columns=("allele", "p"),
    )
@@ -269,6 +316,7 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
        lgx_group=lgx_group,
        exon_group=exon_group,
        p_group=p_group,
+        p_not_g=p_not_g,
    )