Skip to content

Commit 2684ee9

Browse files
authored
Merge pull request #4 from Regev32/master
Handles more mismatches.
2 parents 3ea7e01 + e4ffb9f commit 2684ee9

File tree

14 files changed

+147
-92
lines changed

14 files changed

+147
-92
lines changed

README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ py-graph-match
33

44
Matching with Graph
55

6-
`grma`` is a package for finding HLA matches using graphs approach.
6+
`grma` is a package for finding HLA matches using graphs approach.
77
The matching is based on [grim's](https://github.com/nmdp-bioinformatics/py-graph-imputation) imputation.
88

99

data/donors_dir/donors.txt

100644100755
File mode changed.

grma/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,2 +1,2 @@
11
from grma.donorsgraph.build_donors_graph import BuildMatchingGraph
2-
from grma.match import matching, find_matches
2+
from grma.match.match import matching, find_matches

grma/donorsgraph/build_donors_graph.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
from grma.utilities.geno_representation import HashableArray
1313
from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time
1414

15-
CLASS_I_END = 6
16-
1715

1816
class BuildMatchingGraph:
1917
"""
@@ -111,7 +109,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
111109
geno = gl_string_to_integers(geno)
112110

113111
# sort alleles for each HLA-X
114-
for x in range(0, 10, 2):
112+
for x in range(0, len(geno), 2):
115113
geno[x : x + 2] = sorted(geno[x : x + 2])
116114
geno = HashableArray(geno)
117115

@@ -137,6 +135,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
137135
# continue creation of classes and subclasses
138136
if geno not in layers["GENOTYPE"]:
139137
layers["GENOTYPE"].add(geno)
138+
CLASS_I_END = -2 * int(-len(geno)/4 - 0.5)
140139
geno_class1 = tuple(geno[:CLASS_I_END])
141140
geno_class2 = tuple(geno[CLASS_I_END:])
142141
self._create_classes_edges(geno, geno_class1, layers)

grma/donorsgraph/create_lol.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,10 @@ def _convert(self, layers: Dict[str, Set]):
7676
arrays_start = free
7777
# map lol-ids to arrays
7878
# given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
79+
geno = layers['GENOTYPE'].pop()
80+
layers['GENOTYPE'].add(geno)
7981
map_number_to_arr_node = np.zeros(
80-
(len(layers["GENOTYPE"]), 10), dtype=np.uint16
82+
(len(layers["GENOTYPE"]), len(geno)), dtype=np.uint16
8183
)
8284
for i, geno in tqdm(
8385
enumerate(layers["GENOTYPE"]),

grma/match/donors_matching.py

Lines changed: 66 additions & 70 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020

2121
DONORS_DB: pd.DataFrame = pd.DataFrame()
2222
ZEROS: HashableArray = HashableArray([0])
23-
ALLELES_IN_CLASS_I: int = 6
24-
ALLELES_IN_CLASS_II: int = 4
2523

2624

2725
def set_database(donors_db: pd.DataFrame = pd.DataFrame()):
@@ -39,24 +37,13 @@ def _init_results_df(donors_info):
3937
fields_in_results = {
4038
"Patient_ID": [],
4139
"Donor_ID": [],
40+
"GvH_Mismatches": [],
41+
"HvG_Mismatches": [],
4242
"Number_Of_Mismatches": [],
4343
"Matching_Probability": [],
44-
"Match_Probability_A_1": [],
45-
"Match_Probability_A_2": [],
46-
"Match_Probability_B_1": [],
47-
"Match_Probability_B_2": [],
48-
"Match_Probability_C_1": [],
49-
"Match_Probability_C_2": [],
50-
"Match_Probability_DQB1_1": [],
51-
"Match_Probability_DQB1_2": [],
52-
"Match_Probability_DRB1_1": [],
53-
"Match_Probability_DRB1_2": [],
44+
"Match_Probability": [],
5445
"Permissive/Non-Permissive": [],
55-
"Match_Between_Most_Commons_A": [],
56-
"Match_Between_Most_Commons_B": [],
57-
"Match_Between_Most_Commons_C": [],
58-
"Match_Between_Most_Commons_DQB": [],
59-
"Match_Between_Most_Commons_DRB": [],
46+
"Match_Between_Most_Commons": [],
6047
}
6148

6249
donors_db_fields = DONORS_DB.columns.values.tolist()
@@ -66,17 +53,26 @@ def _init_results_df(donors_info):
6653
return pd.DataFrame(fields_in_results)
6754

6855

69-
def locuses_match_between_genos(geno1, geno2):
56+
def locuses_match_between_genos(geno_pat, geno_don):
7057
matches = []
71-
for i in range(5):
72-
a1, b1 = geno1[2 * i], geno1[2 * i + 1]
73-
a2, b2 = geno2[2 * i], geno2[2 * i + 1]
58+
total_gvh = 0
59+
total_hvg = 0
60+
61+
for i in range(0, len(geno_pat), 2):
62+
a1, b1 = geno_pat[i], geno_pat[i + 1]
63+
a2, b2 = geno_don[i], geno_don[i + 1]
7464

7565
s1 = int(a1 == a2) + int(b1 == b2)
7666
s2 = int(a1 == b2) + int(b1 == a2)
7767
matches.append(max(s1, s2))
7868

79-
return matches
69+
p_set = {x for x in (a1, b1) if x not in (None, 0)}
70+
d_set = {x for x in (a2, b2) if x not in (None, 0)}
71+
72+
total_gvh += len(p_set - d_set) # patient has, donor lacks
73+
total_hvg += len(d_set - p_set) # donor has, patient lacks
74+
75+
return matches, total_gvh, total_hvg
8076

8177

8278
class DonorsMatching(object):
@@ -127,7 +123,7 @@ def probability_to_allele(
127123
) -> List[float]:
128124
"""Takes a donor ID and a genotype.
129125
Returns the probability of match for each allele"""
130-
probs = [0 for _ in range(10)]
126+
probs = [0 for _ in range(len(pat_geno))]
131127

132128
for i, allele in enumerate(pat_geno):
133129
p = 0
@@ -148,7 +144,7 @@ def __find_genotype_candidates_from_class(
148144
) -> Tuple[np.ndarray, np.ndarray]:
149145
"""Takes an integer subclass.
150146
Returns the genotypes (ids and values) which are connected to it in the graph"""
151-
return self._graph.class_neighbors(clss)
147+
return self._graph.class_neighbors(clss, Len = len(list(self.patients.values())[0]))
152148

153149
def __find_donor_from_geno(self, geno_id: int) -> Sequence[int]:
154150
"""Gets the LOL ID of a genotype.
@@ -216,12 +212,14 @@ def __add_matched_genos_to_graph(
216212

217213
def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
218214
subclasses = []
219-
classes = [genotype[:ALLELES_IN_CLASS_I], genotype[ALLELES_IN_CLASS_I:]]
215+
ALLELES_IN_CLASS_I = -2*int(-len(genotype)/4-0.5)
216+
ALLELES_IN_CLASS_II = len(genotype) - ALLELES_IN_CLASS_I
217+
classes = [(genotype[:ALLELES_IN_CLASS_I], 0), (genotype[ALLELES_IN_CLASS_I:], 1)]
220218
num_of_alleles_in_class = [ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_II]
221219

222-
int_classes = [tuple_geno_to_int(tuple(clss)) for clss in classes]
220+
int_classes = [(tuple_geno_to_int(tuple(clss[0])), clss[1]) for clss in classes]
223221
for clss in int_classes:
224-
self._patients_graph.add_edge(clss, genotype)
222+
self._patients_graph.add_edge(clss[0], genotype)
225223

226224
# class one is considered as 0.
227225
# class two is considered as 1.
@@ -231,14 +229,14 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
231229
# set the missing allele to always be the second allele in the locus
232230
if k % 2 == 0:
233231
sub = tuple_geno_to_int(
234-
classes[class_num][0:k] + ZEROS + classes[class_num][k + 1 :]
232+
classes[class_num][0][0:k] + ZEROS + classes[class_num][0][k + 1 :]
235233
)
236234
else:
237235
sub = tuple_geno_to_int(
238-
classes[class_num][0 : k - 1]
236+
classes[class_num][0][0 : k - 1]
239237
+ ZEROS
240-
+ classes[class_num][k - 1 : k]
241-
+ classes[class_num][k + 1 :]
238+
+ classes[class_num][0][k - 1 : k]
239+
+ classes[class_num][0][k + 1 :]
242240
)
243241

244242
# missing allele number is the index of the first allele of the locus the missing allele belongs to.
@@ -255,6 +253,8 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
255253

256254
return int_classes, subclasses
257255

256+
257+
258258
def create_patients_graph(self, f_patients: str):
259259
"""
260260
create patients graph. \n
@@ -300,7 +300,8 @@ def create_patients_graph(self, f_patients: str):
300300
classes_by_patient[patient_id] = set()
301301

302302
# sort alleles for each HLA-X
303-
for x in range(0, 10, 2):
303+
l = len(geno)
304+
for x in range(0, l, 2):
304305
geno[x : x + 2] = sorted(geno[x : x + 2])
305306

306307
geno = HashableArray(geno)
@@ -351,15 +352,14 @@ def find_geno_candidates_by_subclasses(self, subclasses):
351352
genotypes_value,
352353
) = self.__find_genotype_candidates_from_subclass(subclass.subclass)
353354

355+
geno = genotypes_value[0]
356+
ALLELES_IN_CLASS_I = -2*int(-len(geno)/4-0.5)
357+
ALLELES_IN_CLASS_II = len(geno) - ALLELES_IN_CLASS_I
354358
# Checks only the locuses that are not certain to match
355359
if subclass.class_num == 0:
356-
allele_range_to_check = np.array(
357-
[6, 8, subclass.allele_num], dtype=np.uint8
358-
)
360+
allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)] + [subclass.allele_num], dtype=np.uint8)
359361
else:
360-
allele_range_to_check = np.array(
361-
[0, 2, 4, subclass.allele_num], dtype=np.uint8
362-
)
362+
allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)] + [subclass.allele_num], dtype=np.uint8)
363363

364364
# number of alleles that already match due to match in subclass
365365
matched_alleles: int = (
@@ -383,9 +383,9 @@ def find_geno_candidates_by_classes(self, classes):
383383
desc="finding classes matching candidates",
384384
disable=not self.verbose,
385385
):
386-
if self._graph.in_nodes(clss):
386+
if self._graph.in_nodes(clss[0]):
387387
patient_genos = self._patients_graph.neighbors(
388-
clss
388+
clss[0]
389389
) # The patient's genotypes which might be match
390390
(
391391
genotypes_ids,
@@ -395,12 +395,14 @@ def find_geno_candidates_by_classes(self, classes):
395395
# Checks only the locuses that are not certain to match (the locuses of the other class)
396396
# Class I appearances: 3 locuses = 6 alleles = 23/24 digits
397397
# Class II appearances: 2 locuses = 4 alleles = 15/16 digits
398-
if len(str(clss)) > 20:
399-
allele_range_to_check = np.array([6, 8], dtype=np.uint8)
400-
matched_alleles: int = 6
398+
geno = list(self.patients.values())[0]
399+
if clss[1] == 0:
400+
allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)], dtype=np.uint8)
401+
matched_alleles: int = len(geno)//2 + (len(geno)//2 & 1)
402+
401403
else:
402-
allele_range_to_check = np.array([0, 2, 4], dtype=np.uint8)
403-
matched_alleles: int = 4
404+
allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)], dtype=np.uint8)
405+
matched_alleles: int = len(geno)//2 - (len(geno)//2 & 1)
404406

405407
# Compares the candidate to the patient's genotypes, and adds the match geno candidates to the graph.
406408
self.__add_matched_genos_to_graph(
@@ -441,7 +443,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int):
441443
# and each patient connects only to their own genos, so we wouldn't override the weight dict.
442444
# self._patients_graph.add_edge(patient_id, geno_id, weight={geno_num: [probability, 10]}) # AMIT DELETE
443445
self._genotype_candidates[patient_id][geno_id] = {
444-
geno_num: (probability, 10)
446+
geno_num: (probability, len(geno))
445447
} # AMIT ADD
446448
# else:
447449
# print(f"Missing 'geno_num' for patient_id: {patient_id}")
@@ -507,7 +509,7 @@ def score_matches(
507509
].items(): # AMIT ADD
508510
for prob, matches in genotype_matches.values(): # AMIT CHANGE
509511
# match_info = (probability of patient's genotype, number of matches to patient's genotype)
510-
if matches != 10 - mismatch:
512+
if matches != len(list(self.patients.values())[0]) - mismatch:
511513
continue
512514

513515
# add the probabilities multiplication of the patient and all the donors that has this genotype
@@ -568,38 +570,32 @@ def __append_matching_donor(
568570
mm_number: int,
569571
) -> None:
570572
"""add a donor to the matches dictionary"""
571-
572-
compare_commons = locuses_match_between_genos(
573-
self.patients[patient], self.get_most_common_genotype(donor)
573+
pat = self.patients[patient]
574+
don = self.get_most_common_genotype(donor)
575+
compare_commons, gvh, hvg = locuses_match_between_genos(
576+
pat, don
574577
)
575578

576579
add_donors["Patient_ID"].append(patient)
577580
add_donors["Donor_ID"].append(donor)
578581
allele_prob = self.probability_to_allele(
579582
don_id=donor, pat_geno=self.patients[patient]
580583
)
581-
add_donors["Match_Probability_A_1"].append(allele_prob[0])
582-
add_donors["Match_Probability_A_2"].append(allele_prob[1])
583-
add_donors["Match_Probability_B_1"].append(allele_prob[2])
584-
add_donors["Match_Probability_B_2"].append(allele_prob[3])
585-
add_donors["Match_Probability_C_1"].append(allele_prob[4])
586-
add_donors["Match_Probability_C_2"].append(allele_prob[5])
587-
add_donors["Match_Probability_DQB1_1"].append(allele_prob[6])
588-
add_donors["Match_Probability_DQB1_2"].append(allele_prob[7])
589-
add_donors["Match_Probability_DRB1_1"].append(allele_prob[8])
590-
add_donors["Match_Probability_DRB1_2"].append(allele_prob[9])
591-
592-
add_donors["Match_Between_Most_Commons_A"].append(compare_commons[0])
593-
add_donors["Match_Between_Most_Commons_B"].append(compare_commons[1])
594-
add_donors["Match_Between_Most_Commons_C"].append(compare_commons[2])
595-
add_donors["Match_Between_Most_Commons_DQB"].append(compare_commons[3])
596-
add_donors["Match_Between_Most_Commons_DRB"].append(compare_commons[4])
584+
add_donors["Match_Probability"].append(allele_prob)
585+
add_donors["Match_Between_Most_Commons"].append(compare_commons)
597586

598587
add_donors["Matching_Probability"].append(match_prob)
599-
add_donors["Number_Of_Mismatches"].append(mm_number)
600-
add_donors["Permissive/Non-Permissive"].append(
601-
"-"
602-
) # TODO: add permissiveness algorithm
588+
actual_mismatches = 0
589+
for match_score in compare_commons:
590+
if match_score != 2:
591+
actual_mismatches += (2 - match_score)
592+
593+
add_donors["Number_Of_Mismatches"].append(actual_mismatches)
594+
add_donors["GvH_Mismatches"].append(gvh)
595+
add_donors["HvG_Mismatches"].append(hvg)
596+
597+
add_donors["Permissive/Non-Permissive"].append("-")
598+
# TODO: add permissiveness algorithm
603599

604600
# add the other given fields to the results
605601
for field in donors_info:

grma/match/graph_wrapper.py

Lines changed: 4 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from typing import Union
66

77
import numpy as np
8-
98
from grma.utilities.geno_representation import HashableArray
109
from grma.match.lol_graph import LolGraph
1110

@@ -58,11 +57,10 @@ def get_edge_data(
5857
ret = self._graph.get_edge_data(node1_num, node2_num)
5958
return default if ret == exception_val else ret
6059

61-
def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False):
62-
node_num = self._map_node_to_number[node] if not search_lol_id else node
60+
def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10):
61+
node_num = self._map_node_to_number[node[0]] if not search_lol_id else node
6362
neighbors_list = self._graph.neighbors_unweighted(node_num)
64-
65-
neighbors_list_values = np.ndarray([len(neighbors_list), 10], dtype=np.uint16)
63+
neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint16)
6664
for i, neighbor in enumerate(neighbors_list):
6765
neighbors_list_values[i, :] = self._graph.arr_node_value_from_id(neighbor)
6866

@@ -112,7 +110,7 @@ def neighbors(
112110
def neighbors_2nd(self, node):
113111
node_num = self._map_node_to_number[node]
114112
r0, r1 = self._graph.neighbors_2nd(node_num)
115-
return r0[:-1], r1
113+
return r0, r1
116114

117115
def node_value_from_id(self, node_id: int) -> NODES_TYPES:
118116
"""convert lol ID to node value"""

grma/match/lol_graph.pyx

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -155,14 +155,14 @@ cdef class LolGraph:
155155
cdef UINT16[:] arr
156156
cdef np.ndarray[UINT16, ndim=2] neighbors_value
157157
cdef UINT num_of_neighbors_2nd
158+
cdef UINT loci_len
158159

159160
idx = self._index_list[node]
160161
idx_end = self._index_list[node + 1]
161162

162163
neighbors_list_id = np.zeros(idx_end - idx, dtype=np.uint32)
163164
for i in range(idx, idx_end):
164165
neighbors_list_id[i - idx] = self._neighbors_list[i]
165-
166166
num_of_neighbors_2nd = <UINT>self._weights_list[idx]
167167

168168
neighbors_id = np.zeros(int(num_of_neighbors_2nd), dtype=np.uint32)
@@ -177,11 +177,12 @@ cdef class LolGraph:
177177
neighbors_id[pointer] = self._neighbors_list[j]
178178
pointer += 1
179179

180-
neighbors_value = np.zeros((num_of_neighbors_2nd - 1, 10), dtype=np.uint16)
181-
for i in range(len(neighbors_id) - 1):
180+
loci_len = <UINT> self._map_number_to_arr_node.shape[1]
181+
neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint16)
182+
for i in range(len(neighbors_id)):
182183
neighbor_id = neighbors_id[i]
183184
arr = self.arr_node_value_from_id(neighbor_id)
184-
for j in range(10):
185+
for j in range(loci_len):
185186
neighbors_value[i, j] = arr[j]
186187

187188
return neighbors_id, neighbors_value

0 commit comments

Comments
 (0)