|
| 1 | +import functools |
1 | 2 | import sqlite3 |
2 | 3 |
|
3 | 4 | import pandas as pd |
|
6 | 7 | from pyard.broad_splits import broad_splits_mapping |
7 | 8 |
|
8 | 9 | # GitHub URL where IMGT HLA files are downloaded. |
| 10 | +from pyard.smart_sort import smart_sort_comparator |
| 11 | + |
9 | 12 | IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/' |
10 | 13 |
|
11 | 14 | # List of expression characters |
@@ -97,65 +100,6 @@ def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version): |
97 | 100 | return dup_g, g_group, lg_group, lgx_group |
98 | 101 |
|
99 | 102 |
|
100 | | -def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool): |
101 | | - """ |
102 | | - MAC files come in 2 different versions: |
103 | | -
|
104 | | - Martin: when they’re printed, the first is better for encoding and the |
105 | | - second is better for decoding. The entire list was maintained both as an |
106 | | - excel spreadsheet and also as a sybase database table. The excel was the |
107 | | - one that was printed and distributed. |
108 | | -
|
109 | | - **==> numer.v3.txt <==** |
110 | | -
|
111 | | - Sorted by the length and the the values in the list |
112 | | - ``` |
113 | | - "LAST UPDATED: 09/30/20" |
114 | | - CODE SUBTYPE |
115 | | -
|
116 | | - AB 01/02 |
117 | | - AC 01/03 |
118 | | - AD 01/04 |
119 | | - AE 01/05 |
120 | | - AG 01/06 |
121 | | - AH 01/07 |
122 | | - AJ 01/08 |
123 | | - ``` |
124 | | -
|
125 | | - **==> alpha.v3.txt <==** |
126 | | -
|
127 | | - Sorted by the code |
128 | | -
|
129 | | - ``` |
130 | | - "LAST UPDATED: 10/01/20" |
131 | | - * CODE SUBTYPE |
132 | | -
|
133 | | - AA 01/02/03/05 |
134 | | - AB 01/02 |
135 | | - AC 01/03 |
136 | | - AD 01/04 |
137 | | - AE 01/05 |
138 | | - AF 01/09 |
139 | | - AG 01/06 |
140 | | - ``` |
141 | | -
|
142 | | - :param db_connection: |
143 | | - :param data_dir: |
144 | | - :return: |
145 | | - """ |
146 | | - mac_table_name = 'mac_codes' |
147 | | - if refresh_mac or not db.table_exists(db_connection, mac_table_name): |
148 | | - # Load the MAC file to a DataFrame |
149 | | - mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip' |
150 | | - df_mac = pd.read_csv(mac_url, sep='\t', compression='zip', |
151 | | - skiprows=3, names=['Code', 'Alleles']) |
152 | | - # Create a dict from code to alleles |
153 | | - mac = df_mac.set_index("Code")["Alleles"].to_dict() |
154 | | - # Save the mac dict to db |
155 | | - db.save_dict(db_connection, table_name=mac_table_name, |
156 | | - dictionary=mac, columns=('code', 'alleles')) |
157 | | - |
158 | | - |
159 | 103 | def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version): |
160 | 104 | """ |
161 | 105 | Checks to see if there's already an allele list file for the `imgt_version` |
@@ -226,10 +170,110 @@ def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_versio |
226 | 170 | else: |
227 | 171 | xx_codes[broad] = xx_codes[split] |
228 | 172 |
|
229 | | - # Save this version of the valid alleles and xx codes |
| 173 | + # Save this version of the valid alleles |
230 | 174 | db.save_set(db_connection, 'alleles', valid_alleles, 'allele') |
231 | | - flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()} |
| 175 | + # Save this version of xx codes |
| 176 | + flat_xx_codes = {k: '/'.join(sorted(v, key=functools.cmp_to_key(smart_sort_comparator))) |
| 177 | + for k, v in xx_codes.items()} |
232 | 178 | db.save_dict(db_connection, 'xx_codes', flat_xx_codes, |
233 | 179 | ('allele_1d', 'allele_list')) |
234 | 180 |
|
235 | 181 | return valid_alleles, xx_codes |
| 182 | + |
| 183 | + |
def generate_mac_codes(db_connection: sqlite3.Connection, refresh_mac: bool):
    """
    Download the NMDP MAC (Multiple Allele Code) file and store the
    code -> allele-list mapping in the `mac_codes` table.

    MAC files come in 2 different versions:

    Martin: when they're printed, the first is better for encoding and the
    second is better for decoding. The entire list was maintained both as an
    excel spreadsheet and also as a sybase database table. The excel was the
    one that was printed and distributed.

    **==> numer.v3.txt <==**

    Sorted by the length and the values in the list
    ```
    "LAST UPDATED: 09/30/20"
    CODE SUBTYPE

    AB	01/02
    AC	01/03
    AD	01/04
    AE	01/05
    AG	01/06
    AH	01/07
    AJ	01/08
    ```

    **==> alpha.v3.txt <==**

    Sorted by the code

    ```
    "LAST UPDATED: 10/01/20"
    *	CODE	SUBTYPE

    AA	01/02/03/05
    AB	01/02
    AC	01/03
    AD	01/04
    AE	01/05
    AF	01/09
    AG	01/06
    ```

    :param db_connection: Database connection to the sqlite database
    :param refresh_mac: Refresh the database with newer MAC data ?
    :return: None
    """
    mac_table_name = 'mac_codes'
    # Rebuild only when explicitly requested or when the table is missing,
    # so repeated runs don't re-download the (large) MAC file.
    if refresh_mac or not db.table_exists(db_connection, mac_table_name):
        # Load the MAC file to a DataFrame; the first 3 rows are the
        # "LAST UPDATED" banner and column headers, not data.
        mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
        df_mac = pd.read_csv(mac_url, sep='\t', compression='zip',
                             skiprows=3, names=['Code', 'Alleles'])
        # Create a dict from code to alleles
        mac = df_mac.set_index("Code")["Alleles"].to_dict()
        # Save the mac dict to db
        db.save_dict(db_connection, table_name=mac_table_name,
                     dictionary=mac, columns=('code', 'alleles'))
| 242 | + |
def _split_serology_column(df_sero, column):
    """Return one row per individual serology found in *column*.

    The PSA/ASA cells can hold several '/'-separated serologies; split and
    explode them, dropping the '0' and '?' placeholder values that can
    appear inside those lists, and add a locus-qualified 'Sero' column.
    """
    df = df_sero[['Locus', 'Allele', column]].dropna()
    df[column] = df[column].apply(lambda value: value.split('/'))
    df = df.explode(column)
    # Placeholders may survive inside the split lists; mask them to NaN
    # and drop them.
    df = df[(df != '0') & (df != '?')].dropna()
    df['Sero'] = df['Locus'] + df[column]
    return df


def generate_serology_mapping(db_connection: sqlite3.Connection, imgt_version):
    """
    Create and save the serology -> allele-list mapping table.

    Reads the WMDA `rel_dna_ser.txt` file for the given IMGT version,
    combines the USA, PSA and ASA serology columns, and stores, for each
    locus-qualified serology, a '/'-joined sorted list of its alleles in
    the `serology_mapping` table.

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version to download the file for
    :return: None
    """
    if not db.table_exists(db_connection, 'serology_mapping'):
        # Load WMDA serology mapping data; the first 6 rows are header text.
        rel_dna_ser_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/rel_dna_ser.txt'
        df_sero = pd.read_csv(rel_dna_ser_url, sep=';', skiprows=6,
                              names=['Locus', 'Allele', 'USA', 'PSA', 'ASA'],
                              index_col=False)

        # '0' and '?' are placeholder values, not real serology; mask them
        # to NaN so the dropna() calls below remove them.
        df_sero = df_sero[(df_sero != '0') & (df_sero != '?')]
        # Qualify each allele with its locus prefix.
        df_sero['Allele'] = df_sero['Locus'] + df_sero['Allele']

        # USA holds a single serology value per row — no splitting needed.
        usa = df_sero[['Locus', 'Allele', 'USA']].dropna()
        usa['Sero'] = usa['Locus'] + usa['USA']

        # PSA and ASA share identical split/explode/filter handling.
        psa = _split_serology_column(df_sero, 'PSA')
        asa = _split_serology_column(df_sero, 'ASA')

        sero_mapping_combined = pd.concat([usa[['Sero', 'Allele']],
                                           psa[['Sero', 'Allele']],
                                           asa[['Sero', 'Allele']]])
        # NOTE(review): alleles are sorted lexically here, unlike xx_codes
        # which uses smart_sort_comparator — confirm this is intended.
        sero_mapping = sero_mapping_combined.groupby('Sero').\
            apply(lambda x: '/'.join(sorted(x['Allele']))).\
            to_dict()

        # Save the serology mapping to db
        db.save_dict(db_connection, table_name='serology_mapping',
                     dictionary=sero_mapping, columns=('serology', 'allele_list'))
0 commit comments