Skip to content

Commit e584348

Browse files
committed
Moved from hardcoded 5 loci to k-loci. Not tested yet.
1 parent ee48c65 commit e584348

File tree

8 files changed

+69
-94
lines changed

8 files changed

+69
-94
lines changed

data/donors_dir/donors.txt

100644100755
File mode changed.

grma/donorsgraph/build_donors_graph.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,8 +12,6 @@
1212
from grma.utilities.geno_representation import HashableArray
1313
from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time
1414

15-
CLASS_I_END = 6
16-
1715

1816
class BuildMatchingGraph:
1917
"""
@@ -111,7 +109,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
111109
geno = gl_string_to_integers(geno)
112110

113111
# sort alleles for each HLA-X
114-
for x in range(0, 10, 2):
112+
for x in range(0, len(geno), 2):
115113
geno[x : x + 2] = sorted(geno[x : x + 2])
116114
geno = HashableArray(geno)
117115

@@ -137,6 +135,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
137135
# continue creation of classes and subclasses
138136
if geno not in layers["GENOTYPE"]:
139137
layers["GENOTYPE"].add(geno)
138+
CLASS_I_END = -2 * int(-len(geno)/4 - 0.5)
140139
geno_class1 = tuple(geno[:CLASS_I_END])
141140
geno_class2 = tuple(geno[CLASS_I_END:])
142141
self._create_classes_edges(geno, geno_class1, layers)

grma/donorsgraph/create_lol.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -76,8 +76,10 @@ def _convert(self, layers: Dict[str, Set]):
7676
arrays_start = free
7777
# map lol-ids to arrays
7878
# given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
79+
geno = layers['GENOTYPE'].pop()
80+
layers['GENOTYPE'].add(geno)
7981
map_number_to_arr_node = np.zeros(
80-
(len(layers["GENOTYPE"]), 10), dtype=np.uint16
82+
(len(layers["GENOTYPE"]), len(geno)), dtype=np.uint16
8183
)
8284
for i, geno in tqdm(
8385
enumerate(layers["GENOTYPE"]),

grma/match/donors_matching.py

Lines changed: 36 additions & 81 deletions
Original file line numberDiff line numberDiff line change
@@ -20,8 +20,6 @@
2020

2121
DONORS_DB: pd.DataFrame = pd.DataFrame()
2222
ZEROS: HashableArray = HashableArray([0])
23-
ALLELES_IN_CLASS_I: int = 6
24-
ALLELES_IN_CLASS_II: int = 4
2523

2624

2725
def set_database(donors_db: pd.DataFrame = pd.DataFrame()):
@@ -43,22 +41,9 @@ def _init_results_df(donors_info):
4341
"HvG_Mismatches": [],
4442
"Number_Of_Mismatches": [],
4543
"Matching_Probability": [],
46-
"Match_Probability_A_1": [],
47-
"Match_Probability_A_2": [],
48-
"Match_Probability_B_1": [],
49-
"Match_Probability_B_2": [],
50-
"Match_Probability_C_1": [],
51-
"Match_Probability_C_2": [],
52-
"Match_Probability_DQB1_1": [],
53-
"Match_Probability_DQB1_2": [],
54-
"Match_Probability_DRB1_1": [],
55-
"Match_Probability_DRB1_2": [],
44+
"Match_Probability": [],
5645
"Permissive/Non-Permissive": [],
57-
"Match_Between_Most_Commons_A": [],
58-
"Match_Between_Most_Commons_B": [],
59-
"Match_Between_Most_Commons_C": [],
60-
"Match_Between_Most_Commons_DQB": [],
61-
"Match_Between_Most_Commons_DRB": [],
46+
"Match_Between_Most_Commons": [],
6247
}
6348

6449
donors_db_fields = DONORS_DB.columns.values.tolist()
@@ -68,17 +53,26 @@ def _init_results_df(donors_info):
6853
return pd.DataFrame(fields_in_results)
6954

7055

71-
def locuses_match_between_genos(geno1, geno2):
56+
def locuses_match_between_genos(geno_pat, geno_don):
7257
matches = []
73-
for i in range(5):
74-
a1, b1 = geno1[2 * i], geno1[2 * i + 1]
75-
a2, b2 = geno2[2 * i], geno2[2 * i + 1]
58+
total_gvh = 0
59+
total_hvg = 0
60+
61+
for i in range(0, len(geno_pat), 2):
62+
a1, b1 = geno_pat[i], geno_pat[i + 1]
63+
a2, b2 = geno_don[i], geno_don[i + 1]
7664

7765
s1 = int(a1 == a2) + int(b1 == b2)
7866
s2 = int(a1 == b2) + int(b1 == a2)
7967
matches.append(max(s1, s2))
8068

81-
return matches
69+
p_set = {x for x in (a1, b1) if x not in (None, 0)}
70+
d_set = {x for x in (a2, b2) if x not in (None, 0)}
71+
72+
total_gvh += len(p_set - d_set) # patient has, donor lacks
73+
total_hvg += len(d_set - p_set) # donor has, patient lacks
74+
75+
return matches, total_gvh, total_hvg
8276

8377

8478
class DonorsMatching(object):
@@ -129,7 +123,7 @@ def probability_to_allele(
129123
) -> List[float]:
130124
"""Takes a donor ID and a genotype.
131125
Returns the probability of match for each allele"""
132-
probs = [0 for _ in range(10)]
126+
probs = [0 for _ in range(len(pat_geno))]
133127

134128
for i, allele in enumerate(pat_geno):
135129
p = 0
@@ -150,7 +144,7 @@ def __find_genotype_candidates_from_class(
150144
) -> Tuple[np.ndarray, np.ndarray]:
151145
"""Takes an integer subclass.
152146
Returns the genotypes (ids and values) which are connected to it in the graph"""
153-
return self._graph.class_neighbors(clss)
147+
return self._graph.class_neighbors(clss, Len = len(self.patients[0]))
154148

155149
def __find_donor_from_geno(self, geno_id: int) -> Sequence[int]:
156150
"""Gets the LOL ID of a genotype.
@@ -218,6 +212,8 @@ def __add_matched_genos_to_graph(
218212

219213
def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
220214
subclasses = []
215+
ALLELES_IN_CLASS_I = -2*int(-len(genotype)/4-0.5)
216+
ALLELES_IN_CLASS_II = len(genotype) - ALLELES_IN_CLASS_I
221217
classes = [genotype[:ALLELES_IN_CLASS_I], genotype[ALLELES_IN_CLASS_I:]]
222218
num_of_alleles_in_class = [ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_II]
223219

@@ -257,34 +253,7 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
257253

258254
return int_classes, subclasses
259255

260-
def count_GvH_HvG(
261-
self,
262-
pat_geno: Sequence[int],
263-
don_geno: Sequence[int],
264-
) -> Tuple[int, int]:
265-
"""
266-
Count GvH and HvG mismatches locus by locus by set‐difference.
267-
Each locus is two slots in the genotype lists:
268-
A: indices [0,1], B: [2,3], C: [4,5], DQB1: [6,7], DRB1: [8,9]
269-
We drop any “N” (here encoded as 0 or None), then:
270-
GvH = | patient_set – donor_set |
271-
HvG = | donor_set – patient_set |
272-
Sum over all five loci.
273-
"""
274-
total_gvh = 0
275-
total_hvg = 0
276-
277-
for i in range(0, 10, 2):
278-
# build the allele sets, filtering out N/None/0
279-
p_set = {a for a in (pat_geno[i], pat_geno[i + 1]) if a not in (None, 0)}
280-
d_set = {a for a in (don_geno[i], don_geno[i + 1]) if a not in (None, 0)}
281-
282-
# how many the patient has that the donor doesn’t:
283-
total_gvh += len(p_set - d_set)
284-
# how many the donor has that the patient doesn’t:
285-
total_hvg += len(d_set - p_set)
286256

287-
return total_gvh, total_hvg
288257

289258
def create_patients_graph(self, f_patients: str):
290259
"""
@@ -331,7 +300,8 @@ def create_patients_graph(self, f_patients: str):
331300
classes_by_patient[patient_id] = set()
332301

333302
# sort alleles for each HLA-X
334-
for x in range(0, 10, 2):
303+
l = len(geno)
304+
for x in range(0, l, 2):
335305
geno[x : x + 2] = sorted(geno[x : x + 2])
336306

337307
geno = HashableArray(geno)
@@ -382,14 +352,18 @@ def find_geno_candidates_by_subclasses(self, subclasses):
382352
genotypes_value,
383353
) = self.__find_genotype_candidates_from_subclass(subclass.subclass)
384354

355+
geno = genotypes_value[0]
356+
ALLELES_IN_CLASS_I = -2*int(-len(geno)/4-0.5)
357+
ALLELES_IN_CLASS_II = len(geno) - ALLELES_IN_CLASS_I
385358
# Checks only the locuses that are not certain to match
386359
if subclass.class_num == 0:
387360
allele_range_to_check = np.array(
388-
[6, 8, subclass.allele_num], dtype=np.uint8
361+
[c for c in range(ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_I + ALLELES_IN_CLASS_I - 2, 2)] + [subclass.allele_num],
362+
dtype=np.uint8
389363
)
390364
else:
391365
allele_range_to_check = np.array(
392-
[0, 2, 4, subclass.allele_num], dtype=np.uint8
366+
[c for c in range(0, ALLELES_IN_CLASS_I, 2)] + [subclass.allele_num], dtype=np.uint8
393367
)
394368

395369
# number of alleles that already match due to match in subclass
@@ -472,7 +446,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int):
472446
# and each patient connects only to their own genos, so we wouldn't override the weight dict.
473447
# self._patients_graph.add_edge(patient_id, geno_id, weight={geno_num: [probability, 10]}) # AMIT DELETE
474448
self._genotype_candidates[patient_id][geno_id] = {
475-
geno_num: (probability, 10)
449+
geno_num: (probability, len(geno))
476450
} # AMIT ADD
477451
# else:
478452
# print(f"Missing 'geno_num' for patient_id: {patient_id}")
@@ -538,7 +512,7 @@ def score_matches(
538512
].items(): # AMIT ADD
539513
for prob, matches in genotype_matches.values(): # AMIT CHANGE
540514
# match_info = (probability of patient's genotype, number of matches to patient's genotype)
541-
if matches != 10 - mismatch:
515+
if matches != len(self.patients[1]) - mismatch:
542516
continue
543517

544518
# add the probabilities multiplication of the patient and all the donors that has this genotype
@@ -599,46 +573,27 @@ def __append_matching_donor(
599573
mm_number: int,
600574
) -> None:
601575
"""add a donor to the matches dictionary"""
602-
603-
compare_commons = locuses_match_between_genos(
604-
self.patients[patient], self.get_most_common_genotype(donor)
576+
pat = self.patients[patient]
577+
don = self.get_most_common_genotype(donor)
578+
compare_commons, gvh, hvg = locuses_match_between_genos(
579+
pat, don
605580
)
606581

607582
add_donors["Patient_ID"].append(patient)
608583
add_donors["Donor_ID"].append(donor)
609584
allele_prob = self.probability_to_allele(
610585
don_id=donor, pat_geno=self.patients[patient]
611586
)
612-
add_donors["Match_Probability_A_1"].append(allele_prob[0])
613-
add_donors["Match_Probability_A_2"].append(allele_prob[1])
614-
add_donors["Match_Probability_B_1"].append(allele_prob[2])
615-
add_donors["Match_Probability_B_2"].append(allele_prob[3])
616-
add_donors["Match_Probability_C_1"].append(allele_prob[4])
617-
add_donors["Match_Probability_C_2"].append(allele_prob[5])
618-
add_donors["Match_Probability_DQB1_1"].append(allele_prob[6])
619-
add_donors["Match_Probability_DQB1_2"].append(allele_prob[7])
620-
add_donors["Match_Probability_DRB1_1"].append(allele_prob[8])
621-
add_donors["Match_Probability_DRB1_2"].append(allele_prob[9])
622-
623-
add_donors["Match_Between_Most_Commons_A"].append(compare_commons[0])
624-
add_donors["Match_Between_Most_Commons_B"].append(compare_commons[1])
625-
add_donors["Match_Between_Most_Commons_C"].append(compare_commons[2])
626-
add_donors["Match_Between_Most_Commons_DQB"].append(compare_commons[3])
627-
add_donors["Match_Between_Most_Commons_DRB"].append(compare_commons[4])
587+
add_donors["Match_Probability"].append(allele_prob)
588+
add_donors["Match_Between_Most_Commons"].append(compare_commons)
628589

629590
add_donors["Matching_Probability"].append(match_prob)
630-
631591
actual_mismatches = 0
632592
for match_score in compare_commons:
633593
if match_score != 2:
634594
actual_mismatches += (2 - match_score)
635595

636596
add_donors["Number_Of_Mismatches"].append(actual_mismatches)
637-
638-
# compute GvH / HvG counts
639-
pat = self.patients[patient]
640-
don = self.get_most_common_genotype(donor)
641-
gvh, hvg = self.count_GvH_HvG(pat, don)
642597
add_donors["GvH_Mismatches"].append(gvh)
643598
add_donors["HvG_Mismatches"].append(hvg)
644599

grma/match/graph_wrapper.py

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,6 @@
55
from typing import Union
66

77
import numpy as np
8-
98
from grma.utilities.geno_representation import HashableArray
109
from grma.match.lol_graph import LolGraph
1110

@@ -58,11 +57,10 @@ def get_edge_data(
5857
ret = self._graph.get_edge_data(node1_num, node2_num)
5958
return default if ret == exception_val else ret
6059

61-
def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False):
60+
def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10):
6261
node_num = self._map_node_to_number[node] if not search_lol_id else node
6362
neighbors_list = self._graph.neighbors_unweighted(node_num)
64-
65-
neighbors_list_values = np.ndarray([len(neighbors_list), 10], dtype=np.uint16)
63+
neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint16)
6664
for i, neighbor in enumerate(neighbors_list):
6765
neighbors_list_values[i, :] = self._graph.arr_node_value_from_id(neighbor)
6866

grma/match/lol_graph.pyx

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -155,14 +155,14 @@ cdef class LolGraph:
155155
cdef UINT16[:] arr
156156
cdef np.ndarray[UINT16, ndim=2] neighbors_value
157157
cdef UINT num_of_neighbors_2nd
158+
cdef UINT loci_len
158159

159160
idx = self._index_list[node]
160161
idx_end = self._index_list[node + 1]
161162

162163
neighbors_list_id = np.zeros(idx_end - idx, dtype=np.uint32)
163164
for i in range(idx, idx_end):
164165
neighbors_list_id[i - idx] = self._neighbors_list[i]
165-
166166
num_of_neighbors_2nd = <UINT>self._weights_list[idx]
167167

168168
neighbors_id = np.zeros(int(num_of_neighbors_2nd), dtype=np.uint32)
@@ -177,11 +177,12 @@ cdef class LolGraph:
177177
neighbors_id[pointer] = self._neighbors_list[j]
178178
pointer += 1
179179

180-
neighbors_value = np.zeros((num_of_neighbors_2nd, 10), dtype=np.uint16)
180+
loci_len = <UINT> self._map_number_to_arr_node.shape[1]
181+
neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint16)
181182
for i in range(len(neighbors_id)):
182183
neighbor_id = neighbors_id[i]
183184
arr = self.arr_node_value_from_id(neighbor_id)
184-
for j in range(10):
185+
for j in range(loci_len):
185186
neighbors_value[i, j] = arr[j]
186187

187188
return neighbors_id, neighbors_value

grma/match/match.py

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
import pandas as pd
99
from grim import grim
1010
import csv
11+
import ast
1112

1213
from grma.match import Graph as MatchingGraph
1314
from grma.match.donors_matching import DonorsMatching, _init_results_df
@@ -195,6 +196,9 @@ def find_matches(
195196

196197
# the returned dictionary. {patient ID: pd.DataFrame(matches + features)}
197198
patients_results = {patient: None for patient in patients}
199+
with open(imputation_filename, "r") as f:
200+
line = f.readline().strip()
201+
loci = list(dict.fromkeys([item for item in line.split(',')[1].replace('*', ',').replace('+', ',').replace('^', ',').replace(':', ',').split(',') if not item.isdigit() and item]))
198202

199203
if patients:
200204
avg_build_time = (end_build_graph - start_build_graph) / len(patients)
@@ -217,6 +221,23 @@ def find_matches(
217221
patient, g_m, donors_info, threshold, cutoff, classes, subclasses
218222
)
219223

224+
225+
match_Probability = results_df["Match_Probability"]
226+
match_Between_Most_Commons = results_df["Match_Between_Most_Commons"]
227+
Permissive = results_df["Permissive/Non-Permissive"]
228+
df_new = results_df.drop(columns=['Match_Probability', 'Match_Between_Most_Commons', 'Permissive/Non-Permissive']).copy()
229+
230+
for idx, row in enumerate(match_Probability):
231+
k = 0
232+
for locus in loci:
233+
for i in [1, 2]:
234+
df_new.loc[idx, f"Match_Probability_{locus}_{i}"] = row[k]
235+
k += 1
236+
df_new['Permissive/Non-Permissive'] = Permissive
237+
for idx, row in enumerate(match_Between_Most_Commons):
238+
for l, locus in enumerate(loci):
239+
df_new.loc[idx, f"Match_Between_Most_Commons_{locus}"] = row[l]
240+
results_df = df_new.copy()
220241
end = time.time()
221242
patient_time = end - start + avg_build_time
222243

@@ -239,7 +260,6 @@ def find_matches(
239260
f"Saved Matching results for {patient} in "
240261
f"{os.path.join(f'{output_dir}/search_{search_id}', f'Patient_{patient}.csv')}"
241262
)
242-
243263
return patients_results
244264

245265

grma/utilities/cutils.pyx

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -83,7 +83,7 @@ cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] pati
8383
if counted - count_similar > 3:
8484
similarities[i] = -1
8585
break
86-
if 10 - count_similar > 3:
86+
if len(donors_geno) - count_similar > 3:
8787
similarities[i] = -1
8888
else:
8989
similarities[i] = count_similar

0 commit comments

Comments
 (0)