Skip to content

Commit 0dd6321

Browse files
committed
feat(grma): generalize to k loci + compact tree storage
2 parents 3d4efe2 + c0ec68a commit 0dd6321

File tree

13 files changed

+310
-170
lines changed

13 files changed

+310
-170
lines changed

.github/PULL_REQUEST_TEMPLATE/pull_request_template.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,6 @@ Description of Pull Request..
33
## Checklist
44

55
- [ ] Unit tests
6-
- [ ] Documentation
6+
- [ ] Documentation
77

88
Fixes #

grma/.DS_Store

8 KB
Binary file not shown.

grma/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
from grma.donorsgraph.build_donors_graph import BuildMatchingGraph
2-
from grma.match import matching, find_matches
2+
from grma.match.match import matching, find_matches

grma/donorsgraph/build_donors_graph.py

Lines changed: 37 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -10,9 +10,8 @@
1010
from grma.donorsgraph.create_lol import LolBuilder
1111
from grma.match.graph_wrapper import Graph
1212
from grma.utilities.geno_representation import HashableArray
13-
from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time
14-
15-
CLASS_I_END = 6
13+
from grma.utilities.utils import print_time, geno_to_int, gl_string_to_hash
14+
from bidict import bidict
1615

1716

1817
class BuildMatchingGraph:
@@ -21,7 +20,7 @@ class BuildMatchingGraph:
2120
It gets a path to directory with the donors' file, builds the graph and saved it as LOL graph using Cython.
2221
"""
2322

24-
__slots__ = "_verbose", "_graph", "_edges"
23+
__slots__ = "_verbose", "_graph", "_edges", "bidirectional_dict"
2524

2625
def __init__(self, path_to_donors_directory: str, verbose: bool = False):
2726
"""
@@ -33,19 +32,20 @@ def __init__(self, path_to_donors_directory: str, verbose: bool = False):
3332
self._verbose = verbose
3433
self._graph = None # LOL dict-representation
3534
self._edges: List[Edge] = [] # edge-list
35+
self.bidirectional_dict = bidict()
3636
self._save_graph_as_edges(path_to_donors_directory)
3737

3838
def _create_classes_edges(self, geno, class_, layers):
39-
int_class = tuple_geno_to_int(class_)
39+
hash_class = gl_string_to_hash(str(class_)) % 1000000000 + 1000000000
4040

41-
self._edges.append(Edge(int_class, geno, 0))
41+
self._edges.append(Edge(hash_class, geno, 0))
4242

4343
# check if the class node was created
44-
if int_class not in layers["CLASS"]:
45-
layers["CLASS"].add(int_class)
46-
self._create_subclass_edges(class_, int_class, layers)
44+
if hash_class not in layers["CLASS"]:
45+
layers["CLASS"].add(hash_class)
46+
self._create_subclass_edges(class_, hash_class, layers)
4747

48-
def _create_subclass_edges(self, class_, int_class, layers):
48+
def _create_subclass_edges(self, class_, hash_class, layers):
4949
"""
5050
subclasses edges are created by dropping an allele from a class.
5151
each allele we drop, will be replaced with zero,
@@ -58,19 +58,18 @@ def _create_subclass_edges(self, class_, int_class, layers):
5858
# set the missing allele to always be the second allele in the locus
5959
for i in range(num_of_alleles):
6060
if i % 2 == 0:
61-
subclass_alleles.add(
62-
tuple_geno_to_int(tuple(class_[0:i] + (0,) + class_[i + 1 :]))
63-
)
61+
subclass = tuple(class_[0:i] + (0,) + class_[i + 1 :])
6462
else:
65-
subclass_alleles.add(
66-
tuple_geno_to_int(
67-
tuple(class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :])
68-
)
63+
subclass = tuple(
64+
class_[0 : i - 1] + (0, class_[i - 1]) + class_[i + 1 :]
6965
)
7066

67+
hash_subclass = gl_string_to_hash(str(subclass)) % 1000000000 + 2000000000
68+
subclass_alleles.add(hash_subclass)
69+
7170
# add subclass->class edges
7271
for sub in subclass_alleles:
73-
self._edges.append(Edge(sub, int_class, 0))
72+
self._edges.append(Edge(sub, hash_class, 0))
7473
if sub not in layers["SUBCLASS"]:
7574
layers["SUBCLASS"].add(sub)
7675

@@ -103,16 +102,27 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
103102
):
104103
# retrieve all line's parameters
105104
donor_id, geno, probability, index = line.strip().split(",")
106-
donor_id = int(donor_id)
105+
donor_id = -1 * int(donor_id)
107106
index = int(index)
108107
probability = float(probability)
109108

110109
# convert geno to list of integers
111-
geno = gl_string_to_integers(geno)
112-
113-
# sort alleles for each HLA-X
114-
for x in range(0, 10, 2):
115-
geno[x : x + 2] = sorted(geno[x : x + 2])
110+
alleles = [
111+
allele
112+
for locus in geno.split("^")
113+
for allele in locus.split("+")
114+
]
115+
geno = []
116+
for allele in alleles:
117+
if "N" in allele:
118+
geno.append(0)
119+
elif allele in self.bidirectional_dict:
120+
geno.append(self.bidirectional_dict[allele])
121+
else:
122+
self.bidirectional_dict[allele] = (
123+
len(self.bidirectional_dict) + 3
124+
)
125+
geno.append(self.bidirectional_dict[allele])
116126
geno = HashableArray(geno)
117127

118128
# handle new donor appearance in file
@@ -137,6 +147,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
137147
# continue creation of classes and subclasses
138148
if geno not in layers["GENOTYPE"]:
139149
layers["GENOTYPE"].add(geno)
150+
CLASS_I_END = -2 * int(-len(geno) / 4 - 0.5)
140151
geno_class1 = tuple(geno[:CLASS_I_END])
141152
geno_class2 = tuple(geno[CLASS_I_END:])
142153
self._create_classes_edges(geno, geno_class1, layers)
@@ -177,4 +188,5 @@ def to_pickle(self, path: Union[str, os.PathLike]):
177188
178189
:param path: A path to save the pickled object
179190
"""
180-
pickle.dump(self._graph, open(path, "wb"))
191+
graph_bdict = [self._graph, self.bidirectional_dict]
192+
pickle.dump(graph_bdict, open(path, "wb"))

grma/donorsgraph/create_lol.py

Lines changed: 6 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from collections import OrderedDict
77

88
from grma.donorsgraph import Edge
9-
from grma.utilities.utils import print_time, tuple_geno_to_int
9+
from grma.utilities.utils import print_time
1010

1111

1212
class LolBuilder:
@@ -76,16 +76,18 @@ def _convert(self, layers: Dict[str, Set]):
7676
arrays_start = free
7777
# map lol-ids to arrays
7878
# given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
79+
geno = layers["GENOTYPE"].pop()
80+
layers["GENOTYPE"].add(geno)
7981
map_number_to_arr_node = np.zeros(
80-
(len(layers["GENOTYPE"]), 10), dtype=np.uint16
82+
(len(layers["GENOTYPE"]), len(geno)), dtype=np.uint32
8183
)
8284
for i, geno in tqdm(
8385
enumerate(layers["GENOTYPE"]),
8486
desc="(1.3) Map nodes to internal numbers",
8587
disable=not self._verbose,
8688
):
8789
map_node_to_number[geno] = free
88-
map_number_to_arr_node[i, :] = geno.np()
90+
map_number_to_arr_node[i] = geno
8991
free += 1
9092

9193
# map classes to lol-id.
@@ -110,7 +112,7 @@ def _convert(self, layers: Dict[str, Set]):
110112
)
111113
if y < subclasses_start
112114
],
113-
dtype=np.uint32,
115+
dtype=np.int32,
114116
)
115117

116118
print_time("(3/6) Create the index list")
@@ -184,12 +186,6 @@ def _convert(self, layers: Dict[str, Set]):
184186
weights_list=weights_list,
185187
)
186188

187-
# replace geno hashable array to more efficient representation.
188-
for array_geno in layers["GENOTYPE"]:
189-
int_geno = tuple_geno_to_int(array_geno)
190-
map_node_to_number[int_geno] = map_node_to_number[array_geno]
191-
del map_node_to_number[array_geno]
192-
193189
del self._graph
194190
del layers
195191
gc.collect()

0 commit comments

Comments
 (0)