Skip to content

Commit e2358c4

Browse files
committed
Use sqlite3 database for data
- Move data generation code to `data_repository.py`. `db.py` saves/loads tables of data and retrieves MAC codes. Offload MAC codes from memory to a sqlite3 database (natively supported by Python) to reduce the memory footprint. All MAC lookups happen through the db. The alleles and G group expansions are still held in memory. In addition, all generated data is saved as tables in the same database. This leads to one file storing all reference data in a standard format.
1 parent b5f951c commit e2358c4

File tree

4 files changed

+532
-374
lines changed

4 files changed

+532
-374
lines changed

pyard/data_repository.py

Lines changed: 233 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,233 @@
1+
import sqlite3
2+
3+
import pandas as pd
4+
5+
from pyard import db
6+
from pyard.broad_splits import broad_splits_mapping
7+
8+
# GitHub URL where IMGT HLA files are downloaded.
IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'

# List of expression characters: allele suffixes that carry expression
# status (presumably N=Null, Q=Questionable, L=Low, S=Secreted —
# TODO confirm against WHO HLA nomenclature).
expression_chars = ['N', 'Q', 'L', 'S']
13+
14+
15+
def get_n_field_allele(allele: str, n: int) -> str:
    """
    Given an HLA allele of >= n fields, return the n-field allele.

    If the allele ends with an expression character (e.g. the 'N' in
    A*01:01:01:02N) and trimming actually drops fields, the expression
    character is re-appended to the shortened allele.

    :param allele: Original allele
    :param n: n number of fields to reduce to
    :return: trimmed to n fields of the original allele
    """
    fields = allele.split(':')
    trimmed = ':'.join(fields[:n])
    suffix = allele[-1]
    # Re-attach the expression character only when fields were dropped;
    # otherwise the last kept field still carries it.
    if suffix in expression_chars and len(fields) > n:
        return trimmed + suffix
    return trimmed
30+
31+
32+
def get_3field_allele(a: str) -> str:
    """Reduce allele `a` to a 3-field allele, preserving any expression character."""
    return get_n_field_allele(a, 3)
34+
35+
36+
def get_2field_allele(a: str) -> str:
    """Reduce allele `a` to a 2-field allele, preserving any expression character."""
    return get_n_field_allele(a, 2)
38+
39+
40+
def generate_ars_mapping(db_connection: sqlite3.Connection, imgt_version):
    """
    Build the ARS group mappings (G, lg, lgx and ambiguous-G) for the given
    IMGT database version and cache them as tables in the sqlite database.

    If all four tables already exist in the database, they are loaded and
    returned directly; otherwise the WMDA `hla_nom_g.txt` file is downloaded,
    the mappings are derived, saved to the database and returned.

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version
    :return: (dup_g, g_group, lg_group, lgx_group) dictionaries mapping
             alleles to their respective groups
    """
    if db.tables_exists(db_connection, ['dup_g', 'g_group', 'lg_group', 'lgx_group']):
        dup_g = db.load_dict(db_connection, table_name='dup_g', columns=('allele', 'g_group'))
        g_group = db.load_dict(db_connection, table_name='g_group', columns=('allele', 'g'))
        lg_group = db.load_dict(db_connection, table_name='lg_group', columns=('allele', 'lg'))
        lgx_group = db.load_dict(db_connection, table_name='lgx_group', columns=('allele', 'lgx'))
        return dup_g, g_group, lg_group, lgx_group

    ars_url = f'{IMGT_HLA_URL}{imgt_version}/wmda/hla_nom_g.txt'
    # The file has a 6-line comment header; fields are ';'-separated.
    df = pd.read_csv(ars_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()

    # 'A' holds a '/'-separated allele list; expand to one row per allele
    # and prefix both the allele and the G group with the locus.
    df['A'] = df['A'].apply(lambda a: a.split('/'))
    df = df.explode('A')
    df['A'] = df['Locus'] + df['A']
    df['G'] = df['Locus'] + df['G']

    df['2d'] = df['A'].apply(get_2field_allele)
    df['3d'] = df['A'].apply(get_3field_allele)

    # 2-field alleles that map to more than one G group.
    mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts()
    # BUGFIX: read the Series index directly instead of
    # reset_index()['index'] — newer pandas names the reset column after
    # the Series (and adds a 'count' column), so 'index' raises a KeyError.
    multiple_g_list = mg[mg > 1].index.to_list()

    # For ambiguous 2-field alleles, join all their G groups with '/'.
    dup_g = df[df['2d'].isin(multiple_g_list)][['G', '2d']] \
        .drop_duplicates() \
        .groupby('2d', as_index=True).agg("/".join) \
        .to_dict()['G']

    # lg = first 2 fields of the G group + 'g'; lgx = first 2 fields only.
    df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
    df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))

    # Creating dictionaries with allele -> ARS group mapping.
    # 2-field, 3-field and full alleles all map to their group.
    df_g = pd.concat([
        df[['2d', 'G']].rename(columns={'2d': 'A'}),
        df[['3d', 'G']].rename(columns={'3d': 'A'}),
        df[['A', 'G']]
    ], ignore_index=True)
    g_group = df_g.set_index('A')['G'].to_dict()

    df_lg = pd.concat([
        df[['2d', 'lg']].rename(columns={'2d': 'A'}),
        df[['3d', 'lg']].rename(columns={'3d': 'A'}),
        df[['A', 'lg']]
    ], ignore_index=True)
    lg_group = df_lg.set_index('A')['lg'].to_dict()

    df_lgx = pd.concat([
        df[['2d', 'lgx']].rename(columns={'2d': 'A'}),
        df[['3d', 'lgx']].rename(columns={'3d': 'A'}),
        df[['A', 'lgx']]
    ], ignore_index=True)
    lgx_group = df_lgx.set_index('A')['lgx'].to_dict()

    db.save_dict(db_connection, table_name='dup_g', dictionary=dup_g, columns=('allele', 'g_group'))
    db.save_dict(db_connection, table_name='g_group', dictionary=g_group, columns=('allele', 'g'))
    db.save_dict(db_connection, table_name='lg_group', dictionary=lg_group, columns=('allele', 'lg'))
    db.save_dict(db_connection, table_name='lgx_group', dictionary=lgx_group, columns=('allele', 'lgx'))

    return dup_g, g_group, lg_group, lgx_group
98+
99+
100+
def generate_mac_codes(db_connection: sqlite3.Connection):
    """
    Download the MAC (Multiple Allele Code) file and cache it as the
    'mac_codes' table in the sqlite database. A no-op if the table
    already exists.

    MAC files come in 2 different versions:

    Martin: when they’re printed, the first is better for encoding and the
    second is better for decoding. The entire list was maintained both as an
    excel spreadsheet and also as a sybase database table. The excel was the
    one that was printed and distributed.

    **==> numer.v3.txt <==**

    Sorted by the length and the values in the list
    ```
    "LAST UPDATED: 09/30/20"
    CODE SUBTYPE

    AB 01/02
    AC 01/03
    AD 01/04
    AE 01/05
    AG 01/06
    AH 01/07
    AJ 01/08
    ```

    **==> alpha.v3.txt <==**

    Sorted by the code

    ```
    "LAST UPDATED: 10/01/20"
    * CODE SUBTYPE

    AA 01/02/03/05
    AB 01/02
    AC 01/03
    AD 01/04
    AE 01/05
    AF 01/09
    AG 01/06
    ```

    :param db_connection: Database connection to the sqlite database
    :return: None; populates the 'mac_codes' table as a side effect
    """
    mac_table_name = 'mac_codes'
    if not db.table_exists(db_connection, mac_table_name):
        # Load the MAC file to a DataFrame
        mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
        df_mac = pd.read_csv(mac_url, sep='\t', compression='zip', skiprows=3, names=['Code', 'Alleles'])
        # Create a dict from code to alleles
        mac = df_mac.set_index("Code")["Alleles"].to_dict()
        # Save the mac dict to db
        db.save_dict(db_connection, table_name=mac_table_name, dictionary=mac, columns=('code', 'alleles'))
155+
156+
157+
def generate_alleles_and_xx_codes(db_connection: sqlite3.Connection, imgt_version):
    """
    Checks to see if the valid alleles and xx codes for `imgt_version`
    are already cached in the database. If not, downloads the AlleleList
    file, builds a valid allele set with corresponding xx codes, and saves
    both to the database.

    The format of the AlleleList file has a 6-line header with a header
    on the 7th line
    ```
    # file: Allelelist.3290.txt
    # date: 2017-07-10
    # version: IPD-IMGT/HLA 3.29.0
    # origin: https://github.com/ANHIG/IMGTHLA/Allelelist.3290.txt
    # repository: https://raw.githubusercontent.com/ANHIG/IMGTHLA/Latest/Allelelist.3290.txt
    # author: WHO, Steven G. E. Marsh ([email protected])
    AlleleID,Allele
    HLA00001,A*01:01:01:01
    HLA02169,A*01:01:01:02N
    HLA14798,A*01:01:01:03
    HLA15760,A*01:01:01:04
    HLA16415,A*01:01:01:05
    HLA16417,A*01:01:01:06
    HLA16436,A*01:01:01:07
    ```

    :param db_connection: Database connection to the sqlite database
    :param imgt_version: IMGT database version
    :return: (valid_alleles, xx_codes) — set of valid alleles and dict
             mapping a first-field name to its list of 2-field expansions
    """

    if db.table_exists(db_connection, 'alleles'):
        valid_alleles = db.load_set(db_connection, 'alleles')
        xx_codes = db.load_dict(db_connection, 'xx_codes',
                                ('allele_1d', 'allele_list'))
        # Stored flattened as '/'-joined strings; re-expand to lists
        xx_codes = {k: v.split('/') for k, v in xx_codes.items()}
        return valid_alleles, xx_codes

    # Create a Pandas DataFrame from the mac_code list file
    # Skip the header (first 6 lines) and use only the Allele column
    if imgt_version == "Latest":
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.txt'
    else:
        # Versioned allele lists are kept in the Latest branch as
        # Allelelist.<version>.txt (see the repository header above)
        allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.{imgt_version}.txt'
    allele_df = pd.read_csv(allele_list_url, header=6, usecols=['Allele'])

    # Create a set of valid alleles
    # All 2-field, 3-field and the original Alleles are considered valid alleles
    allele_df['2d'] = allele_df['Allele'].apply(get_2field_allele)
    allele_df['3d'] = allele_df['Allele'].apply(get_3field_allele)
    valid_alleles = set(allele_df['Allele']). \
        union(set(allele_df['2d'])). \
        union(set(allele_df['3d']))

    # Create xx_codes mapping from the unique alleles in 2-field column
    xx_df = pd.DataFrame(allele_df['2d'].unique(), columns=['Allele'])
    # Also create a first-field column
    xx_df['1d'] = xx_df['Allele'].apply(lambda x: x.split(":")[0])
    # xx_codes maps a first field name to its 2 field expansion
    xx_codes = xx_df.groupby(['1d']) \
        .apply(lambda x: list(x['Allele'])) \
        .to_dict()

    # Update xx codes with broads and splits
    for broad, splits in broad_splits_mapping.items():
        for split in splits:
            # A split with no alleles of its own has nothing to contribute
            if split not in xx_codes:
                continue
            if broad in xx_codes:
                xx_codes[broad].extend(xx_codes[split])
            else:
                # BUGFIX: copy, don't alias. Assigning the split's list
                # object directly meant a later extend() for this broad
                # also mutated the split's own expansion.
                xx_codes[broad] = xx_codes[split].copy()

    # Save this version of the valid alleles and xx codes
    db.save_set(db_connection, 'alleles', valid_alleles, 'allele')
    # Flatten the lists to '/'-joined strings for tabular storage
    flat_xx_codes = {k: '/'.join(v) for k, v in xx_codes.items()}
    db.save_dict(db_connection, 'xx_codes', flat_xx_codes,
                 ('allele_1d', 'allele_list'))

    return valid_alleles, xx_codes

0 commit comments

Comments
 (0)