
Commit 3c57bb7

Refactor MAC Codes file
- Don't save temporary zip and csv files
- Don't expand the individual alleles for codes into a dict. Read the zip file into a dictionary without any temp files. Saves memory/disk space.

  ```
  (venv) --- /tmp/py-ard » ls -lah mac.pickle*
  -rw-r--r--  1 pbashyal  wheel  278M Oct  6 10:19 mac.pickle
  -rw-r--r--  1 pbashyal  wheel  365M Sep 30 14:45 mac.pickle-old
  ```

- Refactor complex lambdas into functions

# Notes

When reviewing MAC code.

## MAC file

```python
mac_file = data_dir + "/mac.txt"
```

File: 'mac.txt'

2 Different Versions:

> when they’re printed, the first is better for encoding and the second is better for decoding. The entire list was maintained both as an excel spreadsheet and also as a sybase database table. The excel was the one that was printed and distributed and it was rife with typos.

**==> numer.v3.txt <==**

Sorted by the length and the values in the list

```
"LAST UPDATED: 09/30/20"
CODE SUBTYPE

AB 01/02
AC 01/03
AD 01/04
AE 01/05
AG 01/06
AH 01/07
AJ 01/08
```

**==> alpha.v3.txt <==**

Sorted by the code

```
"LAST UPDATED: 10/01/20"
* CODE SUBTYPE

AA 01/02/03/05
AB 01/02
AC 01/03
AD 01/04
AE 01/05
AF 01/09
AG 01/06
```

The function `all_macs` downloads `https://hml.nmdp.org/mac/files/numer.v3.zip` to the file `numeric.v3.zip` and unzips it to `out_file = data_dir + "/numer.v3.txt"`. The first 3 lines are skipped; the rest is turned into a pandas DataFrame.

```
  Code Alleles
0   AB   01/02
1   AC   01/03
2   AD   01/04
3   AE   01/05
4   AG   01/06
```

The DataFrame is written out to `'/tmp/3290/mac.txt'` as a CSV file 851,603 lines long. The `Alleles` column is expanded by splitting on `/`:

```
       Code         Alleles
0        AB        [01, 02]
1        AC        [01, 03]
2        AD        [01, 04]
3        AE        [01, 05]
4        AG        [01, 06]
...     ...             ...
9995  ABTVE  [02, 110, 140]
9996  AUYAN  [02, 110, 145]
9997  AAFFK  [02, 110, 146]
9998  ACTAX  [02, 110, 176]
9999  CKBTE  [02, 110, 308]
```

Comments: No need to download the zip file and save it in an uncompressed format. Just read the zip file into a dictionary without any temp files.

```python
import pandas as pd

url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
df_mac = pd.read_csv(url, sep='\t', compression='zip', skiprows=3, names=['Code', 'Alleles'])
mac_dict = df_mac.set_index("Code")["Alleles"].to_dict()
```
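For reference, a minimal sketch of how the new in-memory format decodes a code (the `expand_mac_code` helper and the tiny sample dict are illustrative only, not part of the commit): each MAC maps to its slash-delimited subtype string, so expansion is just a split on `/` plus the locus prefix.

```python
# Illustrative sketch only (not code from this commit).
# mac_dict maps a MAC code to its slash-delimited subtype string,
# which is the format produced by the pandas one-liner above.
mac_dict = {"AB": "01/02", "AA": "01/02/03/05"}

def expand_mac_code(loc_name: str, code: str) -> list:
    """Expand e.g. ('A*01', 'AB') into ['A*01:01', 'A*01:02']."""
    return [f"{loc_name}:{subtype}" for subtype in mac_dict[code].split("/")]

print(expand_mac_code("A*01", "AB"))  # ['A*01:01', 'A*01:02']
```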
1 parent 52d90e1 commit 3c57bb7

File tree

3 files changed: +90 -82 lines changed


pyard/pyard.py

Lines changed: 81 additions & 50 deletions
@@ -35,7 +35,6 @@
 
 from .broad_splits import broad_splits_mapping
 from .smart_sort import smart_sort_comparator
-from .util import all_macs
 from .util import pandas_explode
 
 # The GitHub URL where IMGT HLA files are downloaded.
@@ -79,17 +78,12 @@ def get_2field_allele(a: str) -> str:
 
 class ARD(object):
     """ ARD reduction for HLA """
-
     def __init__(self, dbversion: str = 'Latest',
                  load_mac_file: bool = True,
                  verbose: bool = False,
                  remove_invalid: bool = True,
                  data_dir: str = None):
-        """
-        ARD -
-        :param dbversion:
-        :type dbversion: str
-        """
+
         self.mac = {}
         self._verbose = verbose
         self._dbversion = dbversion
@@ -107,33 +101,17 @@ def __init__(self, dbversion: str = 'Latest',
         pathlib.Path(data_dir).mkdir(exist_ok=True)
 
         ars_url = IMGT_HLA_URL + dbversion + '/wmda/hla_nom_g.txt'
-
         ars_file = data_dir + '/hla_nom_g.' + str(dbversion) + ".txt"
-        mac_file = data_dir + "/mac.txt"
-        mac_pickle = data_dir + "/mac.pickle"
-
         # Downloading ARS file
         if not os.path.isfile(ars_file):
             if verbose:
                 logging.info("Downloading " + str(dbversion) + " ARD file")
             urllib.request.urlretrieve(ars_url, ars_file)
 
-        # Downloading MAC file
+        # Load MAC codes
         if load_mac_file:
-            if not os.path.isfile(mac_pickle):
-                if verbose:
-                    logging.info("Downloading MAC file")
-                self.mac = all_macs(mac_file, data_dir=data_dir)
-
-                # Writing dict to pickle file
-                with open(mac_pickle, 'wb') as handle:
-                    pickle.dump(self.mac, handle, protocol=pickle.HIGHEST_PROTOCOL)
-            else:
-                if verbose:
-                    logging.info("Loading MAC file")
-                with open(mac_pickle, 'rb') as handle:
-                    self.mac = pickle.load(handle)
-
+            self.generate_mac_codes(data_dir)
+        # Load Alleles and XX Codes
         self.generate_alleles_and_xxcodes(dbversion, data_dir)
 
         # Loading ARS file into pandas
@@ -199,6 +177,71 @@ def __init__(self, dbversion: str = 'Latest',
             df[['A', 'lgx']]],
             ignore_index=True).set_index('A').to_dict()['lgx']
 
+    def generate_mac_codes(self, data_dir):
+        """
+        MAC files come in 2 different versions:
+
+        Martin: when they’re printed, the first is better for encoding and the
+        second is better for decoding. The entire list was maintained both as an
+        excel spreadsheet and also as a sybase database table. The excel was the
+        one that was printed and distributed.
+
+        **==> numer.v3.txt <==**
+
+        Sorted by the length and the the values in the list
+        ```
+        "LAST UPDATED: 09/30/20"
+        CODE SUBTYPE
+
+        AB 01/02
+        AC 01/03
+        AD 01/04
+        AE 01/05
+        AG 01/06
+        AH 01/07
+        AJ 01/08
+        ```
+
+        **==> alpha.v3.txt <==**
+
+        Sorted by the code
+
+        ```
+        "LAST UPDATED: 10/01/20"
+        * CODE SUBTYPE
+
+        AA 01/02/03/05
+        AB 01/02
+        AC 01/03
+        AD 01/04
+        AE 01/05
+        AF 01/09
+        AG 01/06
+        ```
+
+        :param data_dir:
+        :return:
+        """
+
+        mac_pickle = f'{data_dir}/mac.pickle'
+
+        if not os.path.isfile(mac_pickle):
+            if self.verbose:
+                logging.info("Downloading MAC file")
+            # Load the MAC file to a DataFrame
+            mac_url = 'https://hml.nmdp.org/mac/files/numer.v3.zip'
+            df_mac = pd.read_csv(mac_url, sep='\t', compression='zip', skiprows=3, names=['Code', 'Alleles'])
+            self.mac = df_mac.set_index("Code")["Alleles"].to_dict()
+
+            # Writing dict to pickle file
+            with open(mac_pickle, 'wb') as save_file:
+                pickle.dump(self.mac, save_file, protocol=pickle.HIGHEST_PROTOCOL)
+        else:
+            if self.verbose:
+                logging.info("Loading MAC file")
+            with open(mac_pickle, 'rb') as load_file:
+                self.mac = pickle.load(load_file)
+
     def generate_alleles_and_xxcodes(self, dbversion: str, data_dir: str) -> None:
         """
         Checks to see if there's already an allele list file for the `dbversion`
@@ -236,7 +279,6 @@ def generate_alleles_and_xxcodes(self, dbversion: str, data_dir: str) -> None:
         # then reload the files without re-downloading
         if pathlib.Path(allele_file).exists() and \
                 pathlib.Path(xx_codes_file).exists():
-            print("Loading from file.")
             with open(allele_file, 'rb') as load_file:
                 self.valid_alleles = pickle.load(load_file)
             with open(xx_codes_file, 'rb') as load_file:
@@ -398,7 +440,7 @@ def redux(self, allele: str, ars_type: str) -> str:
             return ':'.join(allele.split(':')[0:2])
         else:
             if self.remove_invalid:
-                if allele in self.valid_alleles:
+                if self._is_valid_allele(allele):
                     return allele
                 else:
                     return ''
@@ -446,31 +488,28 @@ def redux_gl(self, glstring: str, redux_type: str) -> str:
         # handle XX codes
         # test that they are valid_alleles
         if (is_mac(glstring) and glstring.split(":")[1] == "XX") and loc_name in self.xxcodes:
-            loc, n = loc_name.split("*")
             return self.redux_gl(
                 "/".join(sorted(self.xxcodes[loc_name], key=functools.cmp_to_key(smart_sort_comparator))), redux_type)
 
         if is_mac(glstring) and code in self.mac:
             if HLA_regex.search(glstring):
                 hla, allele_name = glstring.split("-")
                 loc_name, code = allele_name.split(":")
-                alleles = self.get_alleles(code, loc_name)
+                alleles = self._get_alleles(code, loc_name)
                 return self.redux_gl(
                     "/".join(sorted(["HLA-" + a for a in alleles], key=functools.cmp_to_key(smart_sort_comparator))),
                     redux_type)
             else:
-                alleles = self.get_alleles(code, loc_name)
+                alleles = self._get_alleles(code, loc_name)
                 return self.redux_gl("/".join(sorted(alleles, key=functools.cmp_to_key(smart_sort_comparator))),
                                      redux_type)
         return self.redux(glstring, redux_type)
 
-    def get_alleles(self, code, loc_name):
-        loc, n = loc_name.split("*")
-        alleles = list(filter(lambda a: a in self.valid_alleles,
-                              [loc_name + ":" + a if len(a) <= 3
-                               else loc + "*" + a
-                               for a in self.mac[code]['Alleles']]))
-        return alleles
+    def _is_valid_allele(self, allele):
+        return allele in self.valid_alleles
+
+    def _get_alleles(self, code, loc_name):
+        return filter(self._is_valid_allele, [f'{loc_name}:{a}' for a in self.mac[code].split('/')])
 
     def isvalid(self, allele: str) -> bool:
         """
@@ -492,7 +531,7 @@ def isvalid(self, allele: str) -> bool:
             if HLA_regex.search(allele):
                 # remove 'HLA-' prefix
                 allele = allele[4:]
-            return allele in self.valid_alleles
+            return self._is_valid_allele(allele)
         return True
 
     def isvalid_gl(self, glstring: str) -> bool:
@@ -529,12 +568,8 @@ def mac_toG(self, allele: str) -> str:
         :rtype: str
        """
         loc_name, code = allele.split(":")
-        loc, n = loc_name.split("*")
         if code in self.mac:
-            alleles = list(filter(lambda a: a in self.valid_alleles,
-                                  [loc_name + ":" + a if len(a) <= 3
-                                   else loc + "*" + a
-                                   for a in self.mac[code]['Alleles']]))
+            alleles = self._get_alleles(code, loc_name)
             group = list(filter(partial(is_not, None),
                                 set([self.toG(allele=a)
                                      for a in alleles])))
@@ -565,7 +600,7 @@ def toG(self, allele: str) -> str:
 
     def expand_mac(self, allele: str):
         """
-        Exapnds mac codes
+        Expands mac codes
 
         :param allele: An HLA allele.
         :type: str
@@ -575,13 +610,9 @@ def expand_mac(self, allele: str):
         loc_name, code = allele.split(":")
         loc, n = loc_name.split("*")
         if len(loc.split("-")) == 2:
-            loc = loc.split("-")[1]
             loc_name = loc_name.split("-")[1]
 
         if code in self.mac:
-            return list(filter(lambda a: a in self.valid_alleles,
-                               [loc_name + ":" + a if len(a) <= 3
-                                else loc + "*" + a
-                                for a in self.mac[code]['Alleles']]))
+            return self._get_alleles(code, loc_name)
         else:
             return ''
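A rough usage sketch of the refactored path in pyard/pyard.py (the db version string and data_dir value below are illustrative assumptions, not requirements): the constructor now delegates MAC loading to `generate_mac_codes`, which builds `self.mac` straight from the zipped `numer.v3.txt` and caches it in `mac.pickle`.

```python
# Rough usage sketch, not part of the commit; '3290' and '/tmp/3290'
# are illustrative values only.
from pyard.pyard import ARD

ard = ARD('3290', data_dir='/tmp/3290')  # first run fetches files and pickles the MAC dict
print(ard.isvalid("A*01:01:01"))         # True for an allele in the allele list
print(list(ard.expand_mac("A*01:AB")))   # expands via _get_alleles, e.g. ['A*01:01', 'A*01:02']
```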

pyard/util.py

Lines changed: 0 additions & 23 deletions
@@ -22,35 +22,12 @@
 # > http://www.opensource.org/licenses/lgpl-license.php
 #
 import copy
-import re
-import urllib.request
-import zipfile
 from datetime import datetime, date
 
 import pandas as pd
 from six import integer_types, iteritems
 
 
-def all_macs(csv_file, data_dir, url='https://hml.nmdp.org/mac/files/numer.v3.zip'):
-    urllib.request.urlretrieve(url, 'numeric.v3.zip')
-    zip_ref = zipfile.ZipFile('numeric.v3.zip', 'r')
-    zip_ref.extractall(data_dir)
-    zip_ref.close()
-    data = []
-    out_file = data_dir + "/numer.v3.txt"
-    with open(out_file, 'r') as f:
-        for line in f:
-            line = line.rstrip()
-            if re.search("^\D", line) and not re.search("CODE", line) and not re.search("LAST", line):
-                data.append(line.split("\t"))
-    f.close()
-    df = pd.DataFrame(data, columns=['Code', 'Alleles'])
-    df.to_csv(csv_file, header=True, index=False)
-    df['Alleles'] = df['Alleles'].apply(lambda x: x.split("/"))
-    mac_dict = df.set_index("Code").to_dict('index')
-    return mac_dict
-
-
 def pandas_explode(df, column_to_explode):
     """
     Similar to Hive's EXPLODE function, take a column with iterable elements, and flatten the iterable to one element

tests/test_pyard.py

Lines changed: 9 additions & 9 deletions
@@ -46,15 +46,15 @@ def setUp(self):
         self.assertIsInstance(self.ard, ARD)
 
     def test_no_mac(self):
-        self.ard_no_mac = ARD(self.db_version, data_dir='/tmp/3290', load_mac_file=False)
-        self.assertIsInstance(self.ard_no_mac, ARD)
-        self.assertEqual(len(self.ard_no_mac.mac.keys()), 0)
-        self.assertEqual(self.ard_no_mac.redux("A*01:01:01", 'G'), "A*01:01:01G")
-        self.assertEqual(self.ard_no_mac.redux("A*01:01:01", 'lg'), "A*01:01g")
-        self.assertEqual(self.ard_no_mac.redux("A*01:01:01", 'lgx'), "A*01:01")
-        self.assertEqual(self.ard_no_mac.redux("HLA-A*01:01:01", 'G'), "HLA-A*01:01:01G")
-        self.assertEqual(self.ard_no_mac.redux("HLA-A*01:01:01", 'lg'), "HLA-A*01:01g")
-        self.assertEqual(self.ard_no_mac.redux("HLA-A*01:01:01", 'lgx'), "HLA-A*01:01")
+        ard_no_mac = ARD(self.db_version, data_dir='/tmp/3290', load_mac_file=False)
+        self.assertIsInstance(ard_no_mac, ARD)
+        self.assertEqual(len(ard_no_mac.mac.keys()), 0)
+        self.assertEqual(ard_no_mac.redux("A*01:01:01", 'G'), "A*01:01:01G")
+        self.assertEqual(ard_no_mac.redux("A*01:01:01", 'lg'), "A*01:01g")
+        self.assertEqual(ard_no_mac.redux("A*01:01:01", 'lgx'), "A*01:01")
+        self.assertEqual(ard_no_mac.redux("HLA-A*01:01:01", 'G'), "HLA-A*01:01:01G")
+        self.assertEqual(ard_no_mac.redux("HLA-A*01:01:01", 'lg'), "HLA-A*01:01g")
+        self.assertEqual(ard_no_mac.redux("HLA-A*01:01:01", 'lgx'), "HLA-A*01:01")
 
     def test_remove_invalid(self):
         self.assertEqual(self.ard.redux("A*01:01:01", 'G'), "A*01:01:01G")
