Skip to content

Commit d91e329

Browse files
committed
Refactor G/lg/lgx generation
- Don't save temporary files; load directly from URL. - Refactor complex lambdas into functions. - Use simpler column selections in pandas. - Save the mapping for quick loading next time.
1 parent 3c57bb7 commit d91e329

File tree

6 files changed

+96
-371
lines changed

6 files changed

+96
-371
lines changed

README.rst

Lines changed: 1 addition & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -76,8 +76,7 @@ Example
7676
ard.redux(allele, 'lgx')
7777
# 'A*01:01'
7878
79-
ard_gl = ard.redux_gl("A*01:01/A*01:01N+A*02:AB^B*07:02+B*07:AB", "G")
80-
# >>> ard_gl
79+
ard.redux_gl("A*01:01/A*01:01N+A*02:AB^B*07:02+B*07:AB", "G")
8180
# 'B*07:02:01G+B*07:02:01G^A*01:01:01G+A*02:01:01G/A*02:02'
8281
8382

pyard/base_model_.py

Lines changed: 0 additions & 98 deletions
This file was deleted.

pyard/pyard.py

Lines changed: 54 additions & 67 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,6 @@
2626
import pathlib
2727
import pickle
2828
import re
29-
import urllib.request
3029
from functools import partial
3130
from operator import is_not
3231
from typing import Dict
@@ -35,7 +34,6 @@
3534

3635
from .broad_splits import broad_splits_mapping
3736
from .smart_sort import smart_sort_comparator
38-
from .util import pandas_explode
3937

4038
# The GitHub URL where IMGT HLA files are downloaded.
4139
IMGT_HLA_URL = 'https://raw.githubusercontent.com/ANHIG/IMGTHLA/'
@@ -78,6 +76,7 @@ def get_2field_allele(a: str) -> str:
7876

7977
class ARD(object):
8078
""" ARD reduction for HLA """
79+
8180
def __init__(self, dbversion: str = 'Latest',
8281
load_mac_file: bool = True,
8382
verbose: bool = False,
@@ -90,92 +89,77 @@ def __init__(self, dbversion: str = 'Latest',
9089
self._load_mac_file = load_mac_file
9190
self._remove_invalid = remove_invalid
9291

93-
94-
# TODO: add check for valid_alleles ARD type
95-
# TODO: add check for valid_alleles db version
96-
9792
# Set data directory where all the downloaded files will go
9893
if data_dir is None:
99-
data_dir = os.path.dirname(__file__)
100-
else:
101-
pathlib.Path(data_dir).mkdir(exist_ok=True)
94+
data_dir = pathlib.Path.home() / ".pyard"
10295

103-
ars_url = IMGT_HLA_URL + dbversion + '/wmda/hla_nom_g.txt'
104-
ars_file = data_dir + '/hla_nom_g.' + str(dbversion) + ".txt"
105-
# Downloading ARS file
106-
if not os.path.isfile(ars_file):
107-
if verbose:
108-
logging.info("Downloading " + str(dbversion) + " ARD file")
109-
urllib.request.urlretrieve(ars_url, ars_file)
96+
data_dir = f'{data_dir}/{dbversion}'
97+
pathlib.Path(data_dir).mkdir(parents=True, exist_ok=True)
11098

11199
# Load MAC codes
112100
if load_mac_file:
113101
self.generate_mac_codes(data_dir)
114102
# Load Alleles and XX Codes
115103
self.generate_alleles_and_xxcodes(dbversion, data_dir)
104+
# Load ARS mappings
105+
self.generate_ars_mapping(data_dir)
116106

117-
# Loading ARS file into pandas
118-
# TODO: Make skip dynamic in case the files are not consistent
119-
df = pd.read_csv(ars_file, skiprows=6,
120-
names=["Locus", "A", "G"], sep=";").dropna()
107+
def generate_ars_mapping(self, data_dir):
121108

122-
df['Locus'] = df['Locus'].apply(lambda l: l.split("*")[0])
123-
df['A'] = df[['Locus', 'A']].apply(lambda row: [row['Locus'] + "*" + a
124-
for a in
125-
row['A'].split("/")
126-
],
127-
axis=1)
128-
df['G'] = df[['Locus', 'G']].apply(lambda row: "*".join([row['Locus'],
129-
row['G']]),
130-
axis=1)
109+
mapping_file = f'{data_dir}/ars_mapping.pickle'
110+
if os.path.isfile(mapping_file):
111+
with open(mapping_file, 'rb') as load_file:
112+
ars_mapping = pickle.load(load_file)
113+
self._G, self._lg, self._lgx, self.dup_g = ars_mapping
114+
return
131115

132-
df = pandas_explode(df, 'A')
116+
ars_url = f'{IMGT_HLA_URL}{self._dbversion}/wmda/hla_nom_g.txt'
117+
df = pd.read_csv(ars_url, skiprows=6, names=["Locus", "A", "G"], sep=";").dropna()
118+
119+
df['A'] = df['A'].apply(lambda a: a.split('/'))
120+
df = df.explode('A')
121+
df['A'] = df['Locus'] + df['A']
122+
df['G'] = df['Locus'] + df['G']
133123

134124
df['2d'] = df['A'].apply(get_2field_allele)
135125
df['3d'] = df['A'].apply(get_3field_allele)
136126

137-
df_values = df.drop_duplicates(['2d', 'G'])['2d'] \
138-
.value_counts().reset_index() \
139-
.sort_values(by='2d', ascending=False)
140-
multiple_Glist = df_values[df_values['2d'] > 1]['index'].tolist()
141-
self.dup_g = df[df['2d'].isin(multiple_Glist)][['G', '2d']] \
127+
mg = df.drop_duplicates(['2d', 'G'])['2d'].value_counts()
128+
multiple_g_list = mg[mg > 1].reset_index()['index'].to_list()
129+
130+
self.dup_g = df[df['2d'].isin(multiple_g_list)][['G', '2d']] \
142131
.drop_duplicates() \
143132
.groupby('2d', as_index=True).agg("/".join) \
144133
.to_dict()['G']
145134

146-
df['lg'] = df['G'].apply(lambda a:
147-
":".join(a.split(":")[0:2]) + "g")
148-
149-
df['lgx'] = df['G'].apply(lambda a:
150-
":".join(a.split(":")[0:2]))
135+
df['lg'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]) + "g")
136+
df['lgx'] = df['G'].apply(lambda a: ":".join(a.split(":")[0:2]))
151137

152138
# Creating dictionaries with allele->ARS group mapping
153-
self._G = pd.concat([df.drop(['A', 'lg', 'lgx', '3d'], axis=1)
154-
.rename(index=str,
155-
columns={"2d": "A"})[['A', 'G']],
156-
df.drop(['A', 'lg', 'lgx', '2d'], axis=1)
157-
.rename(index=str,
158-
columns={"3d": "A"})[['A', 'G']],
159-
df[['A', 'G']]],
160-
ignore_index=True).set_index('A').to_dict()['G']
161-
162-
self._lg = pd.concat([df.drop(['A', 'G', 'lgx', '3d'], axis=1)
163-
.rename(index=str,
164-
columns={"2d": "A"})[['A', 'lg']],
165-
df.drop(['A', 'G', 'lgx', '2d'], axis=1)
166-
.rename(index=str,
167-
columns={"3d": "A"})[['A', 'lg']],
168-
df[['A', 'lg']]],
169-
ignore_index=True).set_index('A').to_dict()['lg']
170-
171-
self._lgx = pd.concat([df.drop(['A', 'lg', 'G', '3d'], axis=1)
172-
.rename(index=str,
173-
columns={"2d": "A"})[['A', 'lgx']],
174-
df.drop(['A', 'lg', 'G', '2d'], axis=1)
175-
.rename(index=str,
176-
columns={"3d": "A"})[['A', 'lgx']],
177-
df[['A', 'lgx']]],
178-
ignore_index=True).set_index('A').to_dict()['lgx']
139+
df_G = pd.concat([
140+
df[['2d', 'G']].rename(columns={'2d': 'A'}),
141+
df[['3d', 'G']].rename(columns={'3d': 'A'}),
142+
df[['A', 'G']]
143+
], ignore_index=True)
144+
self._G = df_G.set_index('A')['G'].to_dict()
145+
146+
df_lg = pd.concat([
147+
df[['2d', 'lg']].rename(columns={'2d': 'A'}),
148+
df[['3d', 'lg']].rename(columns={'3d': 'A'}),
149+
df[['A', 'lg']]
150+
])
151+
self._lg = df_lg.set_index('A')['lg'].to_dict()
152+
153+
df_lgx = pd.concat([
154+
df[['2d', 'lgx']].rename(columns={'2d': 'A'}),
155+
df[['3d', 'lgx']].rename(columns={'3d': 'A'}),
156+
df[['A', 'lgx']]
157+
])
158+
self._lgx = df_lgx.set_index('A')['lgx'].to_dict()
159+
160+
ars_mapping = (self._G, self._lg, self._lgx, self.dup_g)
161+
with open(mapping_file, 'wb') as save_file:
162+
pickle.dump(ars_mapping, save_file, protocol=pickle.HIGHEST_PROTOCOL)
179163

180164
def generate_mac_codes(self, data_dir):
181165
"""
@@ -287,7 +271,10 @@ def generate_alleles_and_xxcodes(self, dbversion: str, data_dir: str) -> None:
287271

288272
# Create a Pandas DataFrame from the allele list file
289273
# Skip the header (first 6 lines) and use only the Allele
290-
allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.{dbversion}.txt'
274+
if dbversion == "Latest":
275+
allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.txt'
276+
else:
277+
allele_list_url = f'{IMGT_HLA_URL}Latest/Allelelist.{dbversion}.txt'
291278
allele_df = pd.read_csv(allele_list_url, header=6, usecols=['Allele'])
292279

293280
# Create a set of valid alleles

0 commit comments

Comments (0)