Skip to content

Commit e06cea7

Browse files
authored
Merge pull request #57 from pbashyal-nmdp/refactor_data_to_sqlite
Refactor reference data to db
2 parents b5f951c + 916d1c2 commit e06cea7

File tree

7 files changed

+532
-373
lines changed

7 files changed

+532
-373
lines changed

pyard/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,4 +24,4 @@
2424
from .pyard import ARD
2525

2626
__author__ = """NMDP Bioinformatics"""
27-
__version__ = '0.2.0'
27+
__version__ = '0.3.0'

pyard/data_repository.py

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
import sqlite3
2+
3+
import pandas as pd
4+
5+
from pyard import db
6+
from pyard.broad_splits import broad_splits_mapping
7+
8+
# GitHub URL where IMGT HLA files are downloaded.
IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'

# List of expression characters
expression_chars = ['N', 'Q', 'L', 'S']


def get_n_field_allele(allele: str, n: int) -> str:
    """
    Trim an HLA allele with >= n fields down to its first n fields.

    If the allele carries a trailing expression character (one of
    N/Q/L/S) and is actually being shortened, the expression character
    is carried over onto the trimmed allele.

    :param allele: Original allele
    :param n: number of fields to reduce to
    :return: the allele trimmed to n fields
    """
    fields = allele.split(':')
    trimmed = ':'.join(fields[:n])
    suffix = allele[-1]
    # Only re-attach the expression character when fields were dropped;
    # otherwise the original (already short enough) allele is returned.
    if len(fields) > n and suffix in expression_chars:
        return trimmed + suffix
    return trimmed
30+
31+
32+
def get_3field_allele(a: str) -> str:
    """Return allele `a` trimmed to at most 3 fields, preserving any expression character."""
    return get_n_field_allele(a, 3)
34+
35+
36+
def get_2field_allele(a: str) -> str:
    """Return allele `a` trimmed to at most 2 fields, preserving any expression character."""
    return get_n_field_allele(a, 2)
38+
39+
40+
def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    """
    Generate the ARS group mappings (G, lg and lgx groups) for the given
    IMGT/HLA version from the WMDA `hla_nom_g.txt` file and cache them in
    the sqlite database. If all the tables already exist in the database,
    the cached mappings are loaded instead of being recomputed.

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version (e.g. '3440' or 'Latest')
    :return: (dup_g, g_group, lg_group, lgx_group) dictionaries keyed by allele
    """
    # Serve from the db cache when all 4 tables are already present
    if db.tables_exists(db_connection, ['dup_g', 'g_group', 'lg_group', 'lgx_group']):
        dup_g = db.load_dict(db_connection, table_name='dup_g', columns=('allele', 'g_group'))
        g_group = db.load_dict(db_connection, table_name='g_group', columns=('allele', 'g'))
        lg_group = db.load_dict(db_connection, table_name='lg_group', columns=('allele', 'lg'))
        lgx_group = db.load_dict(db_connection, table_name='lgx_group', columns=('allele', 'lgx'))
        return dup_g, g_group, lg_group, lgx_group

    # hla_nom_g.txt rows (after a 6-line header): Locus;allele/allele/...;G-group
    ars_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt'
    df = pd.read_csv(ars_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

    # Expand the '/'-separated allele list to one row per allele and
    # prefix both alleles and G group names with their locus
    df['A'] = df['A'].apply(lambda a: a.split('/'))
    df = df.explode('A')
    df['A'] = df['Locus'] + df['A']
    df['G'] = df['Locus'] + df['G']

    df['2d'] = df['A'].apply(get_2field_allele)
    df['3d'] = df['A'].apply(get_3field_allele)

    # Find 2-field alleles that belong to more than one G group
    mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts()
    # Use the value_counts index directly: the previous
    # `.reset_index()['index']` relied on the pre-pandas-2.0 column name
    # produced by reset_index and raises KeyError on pandas >= 2.0.
    multiple_g_list = mg[mg > 1].index.to_list()

    # For ambiguous 2-field alleles, record all their G groups joined with '/'
    dup_g = df[df['2d'].isin(multiple_g_list)][['G', '2d']] \
        .drop_duplicates() \
        .groupby('2d', as_index=True).agg("/".join) \
        .to_dict()['G']

    # lg group: first 2 fields of the G group name plus a trailing 'g'
    df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
    # lgx group: first 2 fields of the G group name
    df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))

    def _group_mapping(group_column: str) -> dict:
        # Map the 2-field, 3-field and full forms of every allele to its group
        df_group = pd.concat([
            df[['2d', group_column]].rename(columns={'2d': 'A'}),
            df[['3d', group_column]].rename(columns={'3d': 'A'}),
            df[['A', group_column]]
        ], ignore_index=True)
        return df_group.set_index('A')[group_column].to_dict()

    # Creating dictionaries with allele -> ARS group mapping
    g_group = _group_mapping('G')
    lg_group = _group_mapping('lg')
    lgx_group = _group_mapping('lgx')

    # Cache all 4 mappings so the next run can load them from the db
    db.save_dict(db_connection, table_name='dup_g', dictionary=dup_g, columns=('allele', 'g_group'))
    db.save_dict(db_connection, table_name='g_group', dictionary=g_group, columns=('allele', 'g'))
    db.save_dict(db_connection, table_name='lg_group', dictionary=lg_group, columns=('allele', 'lg'))
    db.save_dict(db_connection, table_name='lgx_group', dictionary=lgx_group, columns=('allele', 'lgx'))

    return dup_g, g_group, lg_group, lgx_group
98+
99+
100+
def generate_mac_codes(db_connection: sqlite3.Connection) -> None:
    """
    Download the MAC (Multiple Allele Code) list and store the
    code -> allele-list mapping in the `mac_codes` table, unless the
    table already exists in the database.

    MAC files come in 2 different versions:

    Martin: when they’re printed, the first is better for encoding and the
    second is better for decoding. The entire list was maintained both as an
    excel spreadsheet and also as a sybase database table. The excel was the
    one that was printed and distributed.

    **==> numer.v3.txt <==**

    Sorted by the length and the values in the list
    ```
    "LAST UPDATED: 09/30/20"
    CODE SUBTYPE

    AB 01/02
    AC 01/03
    AD 01/04
    AE 01/05
    AG 01/06
    AH 01/07
    AJ 01/08
    ```

    **==> alpha.v3.txt <==**

    Sorted by the code
    ```
    "LAST UPDATED: 10/01/20"
    * CODE SUBTYPE

    AA 01/02/03/05
    AB 01/02
    AC 01/03
    AD 01/04
    AE 01/05
    AF 01/09
    AG 01/06
    ```

    :param db_connection: Database connection to the sqlite database
    :return: None
    """
    mac_table_name = 'mac_codes'
    if not db.table_exists(db_connection, mac_table_name):
        # Load the MAC file to a DataFrame
        # (numer.v3: tab-separated, 3 header lines before the data)
        mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
        df_mac = pd.read_csv(mac_url, sep='\t', compression='zip', skiprows=3, names=['Code', 'Alleles'])
        # Create a dict from code to alleles
        mac = df_mac.set_index("Code")["Alleles"].to_dict()
        # Save the mac dict to db
        db.save_dict(db_connection, table_name=mac_table_name, dictionary=mac, columns=('code', 'alleles'))
155+
156+
157+
def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
    """
    Checks to see if there's already an allele table for the `imgt_version`
    in the database. If not, will download the allele list file, create
    a valid allele set and corresponding xx codes, and save them to the db.

    The format of the AlleleList file has a 6-line header with a header
    on the 7th line
    ```
    # file: Allelelist.3290.txt
    # date: 2017-07-10
    # version: IPD-IMGT/HLA 3.29.0
    # origin: https://github.com/ANHIG/IMGTHLA/Allelelist.3290.txt
    # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist.3290.txt
    # author: WHO, Steven G. E. Marsh ([email protected])
    AlleleID,Allele
    HLA00001,A*01:01:01:01
    HLA02169,A*01:01:01:02N
    HLA14798,A*01:01:01:03
    HLA15760,A*01:01:01:04
    HLA16415,A*01:01:01:05
    HLA16417,A*01:01:01:06
    HLA16436,A*01:01:01:07
    ```

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version
    :return: (valid_alleles, xx_codes) tuple
    """
    # Serve from the db cache when the tables are already present
    if db.table_exists(db_connection, 'alleles'):
        valid_alleles = db.load_set(db_connection, 'alleles')
        xx_codes = db.load_dict(db_connection, 'xx_codes',
                                ('allele_1d', 'allele_list'))
        # allele lists are stored flattened as '/'-joined strings
        xx_codes = {k: v.split('/') for k, v in xx_codes.items()}
        return valid_alleles, xx_codes

    # Create a Pandas DataFrame from the allele list file
    # Skip the header (first 6 lines) and use only the Allele column
    if imgt_version == "Latest":
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.txt'
    else:
        # Versioned Allelelist files are published on the 'Latest' branch
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.{imgt_version}.txt'
    allele_df = pd.read_csv(allele_list_url, header=6, usecols=['Allele'])

    # Create a set of valid alleles
    # All 2-field, 3-field and the original Alleles are considered valid alleles
    allele_df['2d'] = allele_df['Allele'].apply(get_2field_allele)
    allele_df['3d'] = allele_df['Allele'].apply(get_3field_allele)
    valid_alleles = set(allele_df['Allele']). \
        union(set(allele_df['2d'])). \
        union(set(allele_df['3d']))

    # Create xx_codes mapping from the unique alleles in 2-field column
    xx_df = pd.DataFrame(allele_df['2d'].unique(), columns=['Allele'])
    # Also create a first-field column
    xx_df['1d'] = xx_df['Allele'].apply(lambda x: x.split(":")[0])
    # xx_codes maps a first field name to its 2 field expansion
    xx_codes = xx_df.groupby(['1d']) \
        .apply(lambda x: list(x['Allele'])) \
        .to_dict()

    # Update xx codes with broads and splits
    for broad, splits in broad_splits_mapping.items():
        for split in splits:
            if broad in xx_codes:
                xx_codes[broad].extend(xx_codes[split])
            else:
                # Copy the split's list instead of aliasing it; with the
                # previous `xx_codes[broad] = xx_codes[split]`, the extend()
                # for the broad's next split also mutated this split's own
                # expansion list.
                xx_codes[broad] = list(xx_codes[split])

    # Save this version of the valid alleles and xx codes
    db.save_set(db_connection, 'alleles', valid_alleles, 'allele')
    # Flatten the allele lists so each fits in a single text column
    flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()}
    db.save_dict(db_connection, 'xx_codes', flat_xx_codes,
                 ('allele_1d', 'allele_list'))

    return valid_alleles, xx_codes

0 commit comments

Comments
 (0)