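Generalized the matching code from a hardcoded 5 loci (10 alleles) to an arbitrary number of loci (k loci), derived from the genotype length. Not tested yet.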

Regev32 · Regev32 · commit e5843483de94 · 2025-08-24T17:29:14.000+03:00
diff --git a/data/donors_dir/donors.txt b/data/donors_dir/donors.txt
diff --git a/grma/donorsgraph/build_donors_graph.py b/grma/donorsgraph/build_donors_graph.py
@@ -12,8 +12,6 @@
 from grma.utilities.geno_representation import HashableArray
 from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time
 
-CLASS_I_END = 6
-
 
 class BuildMatchingGraph:
     """
@@ -111,7 +109,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
                     geno = gl_string_to_integers(geno)
 
                     # sort alleles for each HLA-X
-                    for x in range(0, 10, 2):
+                    for x in range(0, len(geno), 2):
                         geno[x : x + 2] = sorted(geno[x : x + 2])
                     geno = HashableArray(geno)
 
@@ -137,6 +135,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
                     # continue creation of classes and subclasses
                     if geno not in layers["GENOTYPE"]:
                         layers["GENOTYPE"].add(geno)
+                        CLASS_I_END = -2 * int(-len(geno)/4 - 0.5)
                         geno_class1 = tuple(geno[:CLASS_I_END])
                         geno_class2 = tuple(geno[CLASS_I_END:])
                         self._create_classes_edges(geno, geno_class1, layers)
diff --git a/grma/donorsgraph/create_lol.py b/grma/donorsgraph/create_lol.py
@@ -76,8 +76,10 @@ def _convert(self, layers: Dict[str, Set]):
         arrays_start = free
         # map lol-ids to arrays
         # given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
+        geno = layers['GENOTYPE'].pop()
+        layers['GENOTYPE'].add(geno)
         map_number_to_arr_node = np.zeros(
-            (len(layers["GENOTYPE"]), 10), dtype=np.uint16
+            (len(layers["GENOTYPE"]), len(geno)), dtype=np.uint16
         )
         for i, geno in tqdm(
             enumerate(layers["GENOTYPE"]),
diff --git a/grma/match/donors_matching.py b/grma/match/donors_matching.py
@@ -20,8 +20,6 @@
 
 DONORS_DB: pd.DataFrame = pd.DataFrame()
 ZEROS: HashableArray = HashableArray([0])
-ALLELES_IN_CLASS_I: int = 6
-ALLELES_IN_CLASS_II: int = 4
 
 
 def set_database(donors_db: pd.DataFrame = pd.DataFrame()):
@@ -43,22 +41,9 @@ def _init_results_df(donors_info):
         "HvG_Mismatches": [],
         "Number_Of_Mismatches": [],
         "Matching_Probability": [],
-        "Match_Probability_A_1": [],
-        "Match_Probability_A_2": [],
-        "Match_Probability_B_1": [],
-        "Match_Probability_B_2": [],
-        "Match_Probability_C_1": [],
-        "Match_Probability_C_2": [],
-        "Match_Probability_DQB1_1": [],
-        "Match_Probability_DQB1_2": [],
-        "Match_Probability_DRB1_1": [],
-        "Match_Probability_DRB1_2": [],
+        "Match_Probability": [],
         "Permissive/Non-Permissive": [],
-        "Match_Between_Most_Commons_A": [],
-        "Match_Between_Most_Commons_B": [],
-        "Match_Between_Most_Commons_C": [],
-        "Match_Between_Most_Commons_DQB": [],
-        "Match_Between_Most_Commons_DRB": [],
+        "Match_Between_Most_Commons": [],
     }
 
     donors_db_fields = DONORS_DB.columns.values.tolist()
@@ -68,17 +53,26 @@ def _init_results_df(donors_info):
     return pd.DataFrame(fields_in_results)
 
 
-def locuses_match_between_genos(geno1, geno2):
+def locuses_match_between_genos(geno_pat, geno_don):
     matches = []
-    for i in range(5):
-        a1, b1 = geno1[2 * i], geno1[2 * i + 1]
-        a2, b2 = geno2[2 * i], geno2[2 * i + 1]
+    total_gvh = 0
+    total_hvg = 0
+
+    for i in range(0, len(geno_pat), 2):
+        a1, b1 = geno_pat[i],   geno_pat[i + 1]
+        a2, b2 = geno_don[i],   geno_don[i + 1]
 
         s1 = int(a1 == a2) + int(b1 == b2)
         s2 = int(a1 == b2) + int(b1 == a2)
         matches.append(max(s1, s2))
 
-    return matches
+        p_set = {x for x in (a1, b1) if x not in (None, 0)}
+        d_set = {x for x in (a2, b2) if x not in (None, 0)}
+
+        total_gvh += len(p_set - d_set)  # patient has, donor lacks
+        total_hvg += len(d_set - p_set)  # donor has, patient lacks
+
+    return matches, total_gvh, total_hvg
 
 
 class DonorsMatching(object):
@@ -129,7 +123,7 @@ def probability_to_allele(
     ) -> List[float]:
         """Takes a donor ID and a genotype.
         Returns the probability of match for each allele"""
-        probs = [0 for _ in range(10)]
+        probs = [0 for _ in range(len(pat_geno))]
 
         for i, allele in enumerate(pat_geno):
             p = 0
@@ -150,7 +144,7 @@ def __find_genotype_candidates_from_class(
     ) -> Tuple[np.ndarray, np.ndarray]:
         """Takes an integer subclass.
         Returns the genotypes (ids and values) which are connected to it in the graph"""
-        return self._graph.class_neighbors(clss)
+        return self._graph.class_neighbors(clss, Len = len(self.patients[0]))
 
     def __find_donor_from_geno(self, geno_id: int) -> Sequence[int]:
         """Gets the LOL ID of a genotype.
@@ -218,6 +212,8 @@ def __add_matched_genos_to_graph(
 
     def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
         subclasses = []
+        ALLELES_IN_CLASS_I = -2*int(-len(genotype)/4-0.5)
+        ALLELES_IN_CLASS_II = len(genotype) - ALLELES_IN_CLASS_I
         classes = [genotype[:ALLELES_IN_CLASS_I], genotype[ALLELES_IN_CLASS_I:]]
         num_of_alleles_in_class = [ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_II]
 
@@ -257,34 +253,7 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
 
         return int_classes, subclasses
 
-    def count_GvH_HvG(
-            self,
-            pat_geno: Sequence[int],
-            don_geno: Sequence[int],
-    ) -> Tuple[int, int]:
-        """
-        Count GvH and HvG mismatches locus by locus by set‐difference.
-        Each locus is two slots in the genotype lists:
-          A: indices [0,1], B: [2,3], C: [4,5], DQB1: [6,7], DRB1: [8,9]
-        We drop any “N” (here encoded as 0 or None), then:
-          GvH = | patient_set – donor_set |
-          HvG = | donor_set – patient_set |
-        Sum over all five loci.
-        """
-        total_gvh = 0
-        total_hvg = 0
-
-        for i in range(0, 10, 2):
-            # build the allele sets, filtering out N/None/0
-            p_set = {a for a in (pat_geno[i], pat_geno[i + 1]) if a not in (None, 0)}
-            d_set = {a for a in (don_geno[i], don_geno[i + 1]) if a not in (None, 0)}
-
-            # how many the patient has that the donor doesn’t:
-            total_gvh += len(p_set - d_set)
-            # how many the donor has that the patient doesn’t:
-            total_hvg += len(d_set - p_set)
 
-        return total_gvh, total_hvg
 
     def create_patients_graph(self, f_patients: str):
         """
@@ -331,7 +300,8 @@ def create_patients_graph(self, f_patients: str):
                 classes_by_patient[patient_id] = set()
 
             # sort alleles for each HLA-X
-            for x in range(0, 10, 2):
+            l = len(geno)
+            for x in range(0, l, 2):
                 geno[x : x + 2] = sorted(geno[x : x + 2])
 
             geno = HashableArray(geno)
@@ -382,14 +352,18 @@ def find_geno_candidates_by_subclasses(self, subclasses):
                     genotypes_value,
                 ) = self.__find_genotype_candidates_from_subclass(subclass.subclass)
 
+                geno = genotypes_value[0]
+                ALLELES_IN_CLASS_I = -2*int(-len(geno)/4-0.5)
+                ALLELES_IN_CLASS_II = len(geno) - ALLELES_IN_CLASS_I
                 # Checks only the locuses that are not certain to match
                 if subclass.class_num == 0:
                     allele_range_to_check = np.array(
-                        [6, 8, subclass.allele_num], dtype=np.uint8
+                        [c for c in range(ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_I + ALLELES_IN_CLASS_I - 2, 2)] + [subclass.allele_num],
+                        dtype=np.uint8
                     )
                 else:
                     allele_range_to_check = np.array(
-                        [0, 2, 4, subclass.allele_num], dtype=np.uint8
+                        [c for c in range(0, ALLELES_IN_CLASS_I, 2)] + [subclass.allele_num], dtype=np.uint8
                     )
 
                 # number of alleles that already match due to match in subclass
@@ -472,7 +446,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int):
             # and each patient connects only to their own genos, so we wouldn't override the weight dict.
             # self._patients_graph.add_edge(patient_id, geno_id, weight={geno_num: [probability, 10]}) # AMIT DELETE
             self._genotype_candidates[patient_id][geno_id] = {
-                geno_num: (probability, 10)
+                geno_num: (probability, len(geno))
             }  # AMIT ADD
             # else:
             #     print(f"Missing 'geno_num' for patient_id: {patient_id}")
@@ -538,7 +512,7 @@ def score_matches(
         ].items():  # AMIT ADD
             for prob, matches in genotype_matches.values():  # AMIT CHANGE
                 # match_info = (probability of patient's genotype, number of matches to patient's genotype)
-                if matches != 10 - mismatch:
+                if matches != len(self.patients[1]) - mismatch:
                     continue
 
                 # add the probabilities multiplication of the patient and all the donors that has this genotype
@@ -599,46 +573,27 @@ def __append_matching_donor(
         mm_number: int,
     ) -> None:
         """add a donor to the matches dictionary"""
-
-        compare_commons = locuses_match_between_genos(
-            self.patients[patient], self.get_most_common_genotype(donor)
+        pat = self.patients[patient]
+        don = self.get_most_common_genotype(donor)
+        compare_commons, gvh, hvg = locuses_match_between_genos(
+            pat, don
         )
 
         add_donors["Patient_ID"].append(patient)
         add_donors["Donor_ID"].append(donor)
         allele_prob = self.probability_to_allele(
             don_id=donor, pat_geno=self.patients[patient]
         )
-        add_donors["Match_Probability_A_1"].append(allele_prob[0])
-        add_donors["Match_Probability_A_2"].append(allele_prob[1])
-        add_donors["Match_Probability_B_1"].append(allele_prob[2])
-        add_donors["Match_Probability_B_2"].append(allele_prob[3])
-        add_donors["Match_Probability_C_1"].append(allele_prob[4])
-        add_donors["Match_Probability_C_2"].append(allele_prob[5])
-        add_donors["Match_Probability_DQB1_1"].append(allele_prob[6])
-        add_donors["Match_Probability_DQB1_2"].append(allele_prob[7])
-        add_donors["Match_Probability_DRB1_1"].append(allele_prob[8])
-        add_donors["Match_Probability_DRB1_2"].append(allele_prob[9])
-
-        add_donors["Match_Between_Most_Commons_A"].append(compare_commons[0])
-        add_donors["Match_Between_Most_Commons_B"].append(compare_commons[1])
-        add_donors["Match_Between_Most_Commons_C"].append(compare_commons[2])
-        add_donors["Match_Between_Most_Commons_DQB"].append(compare_commons[3])
-        add_donors["Match_Between_Most_Commons_DRB"].append(compare_commons[4])
+        add_donors["Match_Probability"].append(allele_prob)
+        add_donors["Match_Between_Most_Commons"].append(compare_commons)
 
         add_donors["Matching_Probability"].append(match_prob)
-        
         actual_mismatches = 0
         for match_score in compare_commons:
             if match_score != 2:
                 actual_mismatches += (2 - match_score)
 
         add_donors["Number_Of_Mismatches"].append(actual_mismatches)
-
-        # compute GvH / HvG counts
-        pat = self.patients[patient]
-        don = self.get_most_common_genotype(donor)
-        gvh, hvg = self.count_GvH_HvG(pat, don)
         add_donors["GvH_Mismatches"].append(gvh)
         add_donors["HvG_Mismatches"].append(hvg)
 
diff --git a/grma/match/graph_wrapper.py b/grma/match/graph_wrapper.py
@@ -5,7 +5,6 @@
 from typing import Union
 
 import numpy as np
-
 from grma.utilities.geno_representation import HashableArray
 from grma.match.lol_graph import LolGraph
 
@@ -58,11 +57,10 @@ def get_edge_data(
         ret = self._graph.get_edge_data(node1_num, node2_num)
         return default if ret == exception_val else ret
 
-    def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False):
+    def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10):
         node_num = self._map_node_to_number[node] if not search_lol_id else node
         neighbors_list = self._graph.neighbors_unweighted(node_num)
-
-        neighbors_list_values = np.ndarray([len(neighbors_list), 10], dtype=np.uint16)
+        neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint16)
         for i, neighbor in enumerate(neighbors_list):
             neighbors_list_values[i, :] = self._graph.arr_node_value_from_id(neighbor)
 
diff --git a/grma/match/lol_graph.pyx b/grma/match/lol_graph.pyx
@@ -155,14 +155,14 @@ cdef class LolGraph:
         cdef UINT16[:] arr
         cdef np.ndarray[UINT16, ndim=2] neighbors_value
         cdef UINT num_of_neighbors_2nd
+        cdef UINT loci_len
 
         idx = self._index_list[node]
         idx_end = self._index_list[node + 1]
 
         neighbors_list_id = np.zeros(idx_end - idx, dtype=np.uint32)
         for i in range(idx, idx_end):
             neighbors_list_id[i - idx] = self._neighbors_list[i]
-
         num_of_neighbors_2nd = <UINT>self._weights_list[idx]
 
         neighbors_id = np.zeros(int(num_of_neighbors_2nd), dtype=np.uint32)
@@ -177,11 +177,12 @@ cdef class LolGraph:
                 neighbors_id[pointer] = self._neighbors_list[j]
                 pointer += 1
 
-        neighbors_value = np.zeros((num_of_neighbors_2nd, 10), dtype=np.uint16)
+        loci_len = <UINT> self._map_number_to_arr_node.shape[1]
+        neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint16)
         for i in range(len(neighbors_id)):
             neighbor_id = neighbors_id[i]
             arr = self.arr_node_value_from_id(neighbor_id)
-            for j in range(10):
+            for j in range(loci_len):
                 neighbors_value[i, j] = arr[j]
 
         return neighbors_id, neighbors_value
diff --git a/grma/match/match.py b/grma/match/match.py
@@ -8,6 +8,7 @@
 import pandas as pd
 from grim import grim
 import csv
+import ast
 
 from grma.match import Graph as MatchingGraph
 from grma.match.donors_matching import DonorsMatching, _init_results_df
@@ -195,6 +196,9 @@ def find_matches(
 
     # the returned dictionary. {patient ID: pd.DataFrame(matches + features)}
     patients_results = {patient: None for patient in patients}
+    with open(imputation_filename, "r") as f:
+        line = f.readline().strip()
+        loci = list(dict.fromkeys([item for item in line.split(',')[1].replace('*', ',').replace('+', ',').replace('^', ',').replace(':', ',').split(',') if not item.isdigit() and item]))
 
     if patients:
         avg_build_time = (end_build_graph - start_build_graph) / len(patients)
@@ -217,6 +221,23 @@ def find_matches(
             patient, g_m, donors_info, threshold, cutoff, classes, subclasses
         )
 
+
+        match_Probability = results_df["Match_Probability"]
+        match_Between_Most_Commons = results_df["Match_Between_Most_Commons"]
+        Permissive = results_df["Permissive/Non-Permissive"]
+        df_new = results_df.drop(columns=['Match_Probability', 'Match_Between_Most_Commons', 'Permissive/Non-Permissive']).copy()
+
+        for idx, row in enumerate(match_Probability):
+            k = 0
+            for locus in loci:
+                for i in [1, 2]:
+                    df_new.loc[idx, f"Match_Probability_{locus}_{i}"] = row[k]
+                    k += 1
+        df_new['Permissive/Non-Permissive'] = Permissive
+        for idx, row in enumerate(match_Between_Most_Commons):
+            for l, locus in enumerate(loci):
+                df_new.loc[idx, f"Match_Between_Most_Commons_{locus}"] = row[l]
+        results_df = df_new.copy()
         end = time.time()
         patient_time = end - start + avg_build_time
 
@@ -239,7 +260,6 @@ def find_matches(
                     f"Saved Matching results for {patient} in "
                     f"{os.path.join(f'{output_dir}/search_{search_id}', f'Patient_{patient}.csv')}"
                 )
-
     return patients_results
 
 
diff --git a/grma/utilities/cutils.pyx b/grma/utilities/cutils.pyx
@@ -83,7 +83,7 @@ cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] pati
             if counted - count_similar > 3:
                 similarities[i] = -1
                 break
-        if 10 - count_similar > 3:
+        if len(donors_geno) - count_similar > 3:
             similarities[i] = -1
         else:
             similarities[i] = count_similar