import sqlite3

import pandas as pd

from pyard import db
from pyard.broad_splits import broad_splits_mapping

# GitHub URL from which IMGT HLA files are downloaded
IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'

# List of expression characters
expression_chars = ['N', 'Q', 'L', 'S']


def get_n_field_allele(allele: str, n: int) -> str:
    """
    Given an HLA allele of >= n fields, return the n-field allele.
    Preserves the expression character if it exists.

    :param allele: Original allele
    :param n: Number of fields to reduce to
    :return: The original allele trimmed to n fields
    """
    last_char = allele[-1]
    fields = allele.split(':')
    if last_char in expression_chars and len(fields) > n:
        return ':'.join(fields[0:n]) + last_char
    else:
        return ':'.join(fields[0:n])


def get_3field_allele(a: str) -> str:
    return get_n_field_allele(a, 3)


def get_2field_allele(a: str) -> str:
    return get_n_field_allele(a, 2)
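
# Example (illustrative): trimming preserves any expression character.
#   get_2field_allele('A*01:01:01:02N') -> 'A*01:01N'
#   get_3field_allele('A*01:01:01:02N') -> 'A*01:01:01N'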


def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    # If the mappings were already generated, load them from the database
    if db.tables_exists(db_connection, ['dup_g', 'g_group', 'lg_group', 'lgx_group']):
        dup_g = db.load_dict(db_connection, table_name='dup_g', columns=('allele', 'g_group'))
        g_group = db.load_dict(db_connection, table_name='g_group', columns=('allele', 'g'))
        lg_group = db.load_dict(db_connection, table_name='lg_group', columns=('allele', 'lg'))
        lgx_group = db.load_dict(db_connection, table_name='lgx_group', columns=('allele', 'lgx'))
        return dup_g, g_group, lg_group, lgx_group

    # Load the G group definitions (hla_nom_g.txt) for this IMGT version
    ars_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt'
    df = pd.read_csv(ars_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

    # Expand the '/'-delimited allele list into one row per allele
    # and prefix both the allele and the G group with the locus
    df['A'] = df['A'].apply(lambda a: a.split('/'))
    df = df.explode('A')
    df['A'] = df['Locus'] + df['A']
    df['G'] = df['Locus'] + df['G']

    # 2-field and 3-field versions of each allele
    df['2d'] = df['A'].apply(get_2field_allele)
    df['3d'] = df['A'].apply(get_3field_allele)

    # Find 2-field alleles that roll up to more than one G group
    mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts()
    multiple_g_list = mg[mg > 1].index.to_list()

    # Map each such 2-field allele to all of its G groups, joined with '/'
    dup_g = df[df['2d'].isin(multiple_g_list)][['G', '2d']] \
        .drop_duplicates() \
        .groupby('2d', as_index=True).agg("/".join) \
        .to_dict()['G']
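
    # Example shape (hypothetical allele): a 2-field allele that appears
    # under two G groups maps to both, joined with '/':
    #   dup_g['XX*99:99'] -> 'XX*99:99:01G/XX*99:99:02G'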

    # Derive lg (2-field + 'g') and lgx (2-field) groups from the G group
    df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
    df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))
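    # Example (illustrative): G group 'A*01:01:01G' yields
    # lg 'A*01:01g' and lgx 'A*01:01'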

    # Create dictionaries mapping each allele (original, 2-field, 3-field)
    # to its ARS group
    df_g = pd.concat([
        df[['2d', 'G']].rename(columns={'2d': 'A'}),
        df[['3d', 'G']].rename(columns={'3d': 'A'}),
        df[['A', 'G']]
    ], ignore_index=True)
    g_group = df_g.set_index('A')['G'].to_dict()

    df_lg = pd.concat([
        df[['2d', 'lg']].rename(columns={'2d': 'A'}),
        df[['3d', 'lg']].rename(columns={'3d': 'A'}),
        df[['A', 'lg']]
    ], ignore_index=True)
    lg_group = df_lg.set_index('A')['lg'].to_dict()

    df_lgx = pd.concat([
        df[['2d', 'lgx']].rename(columns={'2d': 'A'}),
        df[['3d', 'lgx']].rename(columns={'3d': 'A'}),
        df[['A', 'lgx']]
    ], ignore_index=True)
    lgx_group = df_lgx.set_index('A')['lgx'].to_dict()

    # Persist the mappings so later runs can load them from the database
    db.save_dict(db_connection, table_name='dup_g', dictionary=dup_g, columns=('allele', 'g_group'))
    db.save_dict(db_connection, table_name='g_group', dictionary=g_group, columns=('allele', 'g'))
    db.save_dict(db_connection, table_name='lg_group', dictionary=lg_group, columns=('allele', 'lg'))
    db.save_dict(db_connection, table_name='lgx_group', dictionary=lgx_group, columns=('allele', 'lgx'))

    return dup_g, g_group, lg_group, lgx_group
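
# Example (illustrative) of the mappings returned by generate_ars_mapping:
#   g_group['A*01:01:01:01'] -> 'A*01:01:01G'
#   lg_group['A*01:01']      -> 'A*01:01g'
#   lgx_group['A*01:01']     -> 'A*01:01'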


def generate_mac_codes(db_connection: sqlite3.Connection):
    """
    MAC files come in 2 different versions:

    Martin: when they're printed, the first is better for encoding and the
    second is better for decoding. The entire list was maintained both as an
    Excel spreadsheet and also as a Sybase database table. The Excel version
    was the one that was printed and distributed.

    **==> numer.v3.txt <==**

    Sorted by the length and then the values in the list
    ```
    "LAST UPDATED: 09/30/20"
    CODE SUBTYPE

    AB 01/02
    AC 01/03
    AD 01/04
    AE 01/05
    AG 01/06
    AH 01/07
    AJ 01/08
    ```

    **==> alpha.v3.txt <==**

    Sorted by the code

    ```
    "LAST UPDATED: 10/01/20"
    * CODE SUBTYPE

    AA 01/02/03/05
    AB 01/02
    AC 01/03
    AD 01/04
    AE 01/05
    AF 01/09
    AG 01/06
    ```

    :param db_connection: Database connection to the sqlite database
    :return: None
    """
    mac_table_name = 'mac_codes'
    if not db.table_exists(db_connection, mac_table_name):
        # Load the MAC file into a DataFrame
        mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
        df_mac = pd.read_csv(mac_url, sep='\t', compression='zip', skiprows=3, names=['Code', 'Alleles'])
        # Create a dict from code to alleles
        mac = df_mac.set_index("Code")["Alleles"].to_dict()
        # Save the mac dict to db
        db.save_dict(db_connection, table_name=mac_table_name, dictionary=mac, columns=('code', 'alleles'))
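
# Example (illustrative): once generate_mac_codes has run, the 'mac_codes'
# table maps a MAC to its subtypes, e.g. 'AB' -> '01/02', so A*01:AB
# expands to A*01:01/A*01:02.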


def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
    """
    Checks to see if the valid alleles and XX codes for `imgt_version`
    are already in the database. If not, downloads the allele list file
    and creates a valid allele set and corresponding XX codes.

    The AlleleList file has a 6-line comment header, with the column
    header on the 7th line:
    ```
    # file: Allelelist.3290.txt
    # date: 2017-07-10
    # version: IPD-IMGT/HLA 3.29.0
    # origin: https://github.com/ANHIG/IMGTHLA/Allelelist.3290.txt
    # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist.3290.txt
    # author: WHO, Steven G. E. Marsh ([email protected])
    AlleleID,Allele
    HLA00001,A*01:01:01:01
    HLA02169,A*01:01:01:02N
    HLA14798,A*01:01:01:03
    HLA15760,A*01:01:01:04
    HLA16415,A*01:01:01:05
    HLA16417,A*01:01:01:06
    HLA16436,A*01:01:01:07
    ```

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version
    :return: Set of valid alleles and dictionary of XX codes
    """

    # If already generated, load the alleles and XX codes from the database
    if db.table_exists(db_connection, 'alleles'):
        valid_alleles = db.load_set(db_connection, 'alleles')
        xx_codes = db.load_dict(db_connection, 'xx_codes',
                                ('allele_1d', 'allele_list'))
        xx_codes = {k: v.split('/') for k, v in xx_codes.items()}
        return valid_alleles, xx_codes

    # Create a Pandas DataFrame from the allele list file
    # Skip the header (first 6 lines) and use only the Allele column
    if imgt_version == "Latest":
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.txt'
    else:
        # Versioned Allelelist files are also available on the Latest branch
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.{imgt_version}.txt'
    allele_df = pd.read_csv(allele_list_url, header=6, usecols=['Allele'])

    # Create a set of valid alleles
    # All 2-field, 3-field and the original alleles are considered valid
    allele_df['2d'] = allele_df['Allele'].apply(get_2field_allele)
    allele_df['3d'] = allele_df['Allele'].apply(get_3field_allele)
    valid_alleles = set(allele_df['Allele']). \
        union(set(allele_df['2d'])). \
        union(set(allele_df['3d']))

    # Create xx_codes mapping from the unique alleles in the 2-field column
    xx_df = pd.DataFrame(allele_df['2d'].unique(), columns=['Allele'])
    # Also create a first-field column
    xx_df['1d'] = xx_df['Allele'].apply(lambda x: x.split(":")[0])
    # xx_codes maps a first-field name to its 2-field expansions
    xx_codes = xx_df.groupby('1d')['Allele'] \
        .apply(list) \
        .to_dict()
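
    # Example (illustrative): xx_codes['A*01'] -> ['A*01:01', 'A*01:02', ...]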

    # Update XX codes with broads and splits
    for broad, splits in broad_splits_mapping.items():
        for split in splits:
            if broad in xx_codes:
                xx_codes[broad].extend(xx_codes[split])
            else:
                # Copy so later extends don't also mutate the split's list
                xx_codes[broad] = xx_codes[split].copy()

    # Save this version of the valid alleles and XX codes
    db.save_set(db_connection, 'alleles', valid_alleles, 'allele')
    flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()}
    db.save_dict(db_connection, 'xx_codes', flat_xx_codes,
                 ('allele_1d', 'allele_list'))

    return valid_alleles, xx_codes
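

# Minimal usage sketch (illustrative; assumes network access, a writable
# SQLite database, and the pyard.db helper functions used above):
#
#   connection = sqlite3.connect('pyard.db')
#   dup_g, g_group, lg_group, lgx_group = generate_ars_mapping(connection, '3290')
#   generate_mac_codes(connection)
#   valid_alleles, xx_codes = generate_alleles_and_xx_codes(connection, '3290')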