diff --git a/README.md b/README.md
index 5ed88c5..977a919 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@ py-graph-match
 Matching with Graph
 
-`grma`` is a package for finding HLA matches using graphs approach.
+`grma` is a package for finding HLA matches using a graph-based approach.
 
 The matching is based on [grim's](https://github.com/nmdp-bioinformatics/py-graph-imputation) imputation.
 
diff --git a/data/donors_dir/donors.txt b/data/donors_dir/donors.txt
old mode 100644
new mode 100755
diff --git a/grma/__init__.py b/grma/__init__.py
index d8319e8..2f3da52 100644
--- a/grma/__init__.py
+++ b/grma/__init__.py
@@ -1,2 +1,2 @@
 from grma.donorsgraph.build_donors_graph import BuildMatchingGraph
-from grma.match import matching, find_matches
+from grma.match.match import matching, find_matches
diff --git a/grma/donorsgraph/build_donors_graph.py b/grma/donorsgraph/build_donors_graph.py
index 303867f..66b06c4 100644
--- a/grma/donorsgraph/build_donors_graph.py
+++ b/grma/donorsgraph/build_donors_graph.py
@@ -12,8 +12,6 @@
 from grma.utilities.geno_representation import HashableArray
 from grma.utilities.utils import gl_string_to_integers, tuple_geno_to_int, print_time
 
-CLASS_I_END = 6
-
 
 class BuildMatchingGraph:
     """
@@ -111,7 +109,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
             geno = gl_string_to_integers(geno)
 
             # sort alleles for each HLA-X
-            for x in range(0, 10, 2):
+            for x in range(0, len(geno), 2):
                 geno[x : x + 2] = sorted(geno[x : x + 2])
 
             geno = HashableArray(geno)
@@ -137,6 +135,7 @@ def _save_graph_as_edges(self, path_to_donors_directory: str | os.PathLike):
             # continue creation of classes and subclasses
             if geno not in layers["GENOTYPE"]:
                 layers["GENOTYPE"].add(geno)
+                CLASS_I_END = -2 * int(-len(geno)/4 - 0.5)  # == 2 * ceil(len(geno) / 4): class I takes the extra locus when the locus count is odd
                 geno_class1 = tuple(geno[:CLASS_I_END])
                 geno_class2 = tuple(geno[CLASS_I_END:])
                 self._create_classes_edges(geno, geno_class1, layers)
diff --git a/grma/donorsgraph/create_lol.py b/grma/donorsgraph/create_lol.py
index ce861c0..8365227 100644
--- a/grma/donorsgraph/create_lol.py
+++ b/grma/donorsgraph/create_lol.py
@@ -76,8 +76,10 @@ def _convert(self, layers: Dict[str, Set]):
         arrays_start = free
         # map lol-ids to arrays
         # given an lol_id, the mapping will be map_number_to_arr_node[lol_id - arrays_start, :]
+        geno = layers['GENOTYPE'].pop()  # peek at one genotype to learn the allele count; pop + re-add leaves the set unchanged
+        layers['GENOTYPE'].add(geno)
         map_number_to_arr_node = np.zeros(
-            (len(layers["GENOTYPE"]), 10), dtype=np.uint16
+            (len(layers["GENOTYPE"]), len(geno)), dtype=np.uint16
         )
         for i, geno in tqdm(
             enumerate(layers["GENOTYPE"]),
diff --git a/grma/match/donors_matching.py b/grma/match/donors_matching.py
index bbeb06e..cbd0dc2 100644
--- a/grma/match/donors_matching.py
+++ b/grma/match/donors_matching.py
@@ -20,8 +20,6 @@
 
 DONORS_DB: pd.DataFrame = pd.DataFrame()
 ZEROS: HashableArray = HashableArray([0])
-ALLELES_IN_CLASS_I: int = 6
-ALLELES_IN_CLASS_II: int = 4
 
 
 def set_database(donors_db: pd.DataFrame = pd.DataFrame()):
@@ -39,24 +37,13 @@ def _init_results_df(donors_info):
     fields_in_results = {
         "Patient_ID": [],
         "Donor_ID": [],
+        "GvH_Mismatches": [],
+        "HvG_Mismatches": [],
         "Number_Of_Mismatches": [],
         "Matching_Probability": [],
-        "Match_Probability_A_1": [],
-        "Match_Probability_A_2": [],
-        "Match_Probability_B_1": [],
-        "Match_Probability_B_2": [],
-        "Match_Probability_C_1": [],
-        "Match_Probability_C_2": [],
-        "Match_Probability_DQB1_1": [],
-        "Match_Probability_DQB1_2": [],
-        "Match_Probability_DRB1_1": [],
-        "Match_Probability_DRB1_2": [],
+        "Match_Probability": [],
         "Permissive/Non-Permissive": [],
"Match_Between_Most_Commons_A": [], - "Match_Between_Most_Commons_B": [], - "Match_Between_Most_Commons_C": [], - "Match_Between_Most_Commons_DQB": [], - "Match_Between_Most_Commons_DRB": [], + "Match_Between_Most_Commons": [], } donors_db_fields = DONORS_DB.columns.values.tolist() @@ -66,17 +53,26 @@ def _init_results_df(donors_info): return pd.DataFrame(fields_in_results) -def locuses_match_between_genos(geno1, geno2): +def locuses_match_between_genos(geno_pat, geno_don): matches = [] - for i in range(5): - a1, b1 = geno1[2 * i], geno1[2 * i + 1] - a2, b2 = geno2[2 * i], geno2[2 * i + 1] + total_gvh = 0 + total_hvg = 0 + + for i in range(0, len(geno_pat), 2): + a1, b1 = geno_pat[i], geno_pat[i + 1] + a2, b2 = geno_don[i], geno_don[i + 1] s1 = int(a1 == a2) + int(b1 == b2) s2 = int(a1 == b2) + int(b1 == a2) matches.append(max(s1, s2)) - return matches + p_set = {x for x in (a1, b1) if x not in (None, 0)} + d_set = {x for x in (a2, b2) if x not in (None, 0)} + + total_gvh += len(p_set - d_set) # patient has, donor lacks + total_hvg += len(d_set - p_set) # donor has, patient lacks + + return matches, total_gvh, total_hvg class DonorsMatching(object): @@ -127,7 +123,7 @@ def probability_to_allele( ) -> List[float]: """Takes a donor ID and a genotype. Returns the probability of match for each allele""" - probs = [0 for _ in range(10)] + probs = [0 for _ in range(len(pat_geno))] for i, allele in enumerate(pat_geno): p = 0 @@ -148,7 +144,7 @@ def __find_genotype_candidates_from_class( ) -> Tuple[np.ndarray, np.ndarray]: """Takes an integer subclass. Returns the genotypes (ids and values) which are connected to it in the graph""" - return self._graph.class_neighbors(clss) + return self._graph.class_neighbors(clss, Len = len(list(self.patients.values())[0])) def __find_donor_from_geno(self, geno_id: int) -> Sequence[int]: """Gets the LOL ID of a genotype. @@ -216,12 +212,14 @@ def __add_matched_genos_to_graph( def __classes_and_subclasses_from_genotype(self, genotype: HashableArray): subclasses = [] - classes = [genotype[:ALLELES_IN_CLASS_I], genotype[ALLELES_IN_CLASS_I:]] + ALLELES_IN_CLASS_I = -2*int(-len(genotype)/4-0.5) + ALLELES_IN_CLASS_II = len(genotype) - ALLELES_IN_CLASS_I + classes = [(genotype[:ALLELES_IN_CLASS_I], 0), (genotype[ALLELES_IN_CLASS_I:], 1)] num_of_alleles_in_class = [ALLELES_IN_CLASS_I, ALLELES_IN_CLASS_II] - int_classes = [tuple_geno_to_int(tuple(clss)) for clss in classes] + int_classes = [(tuple_geno_to_int(tuple(clss[0])), clss[1]) for clss in classes] for clss in int_classes: - self._patients_graph.add_edge(clss, genotype) + self._patients_graph.add_edge(clss[0], genotype) # class one is considered as 0. # class two is considered as 1. @@ -231,14 +229,14 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray): # set the missing allele to always be the second allele in the locus if k % 2 == 0: sub = tuple_geno_to_int( - classes[class_num][0:k] + ZEROS + classes[class_num][k + 1 :] + classes[class_num][0][0:k] + ZEROS + classes[class_num][0][k + 1 :] ) else: sub = tuple_geno_to_int( - classes[class_num][0 : k - 1] + classes[class_num][0][0 : k - 1] + ZEROS - + classes[class_num][k - 1 : k] - + classes[class_num][k + 1 :] + + classes[class_num][0][k - 1 : k] + + classes[class_num][0][k + 1 :] ) # missing allele number is the index of the first allele of the locus the missing allele belongs to. 
@@ -255,6 +253,8 @@ def __classes_and_subclasses_from_genotype(self, genotype: HashableArray):
 
         return int_classes, subclasses
 
+
+
     def create_patients_graph(self, f_patients: str):
         """
         create patients graph. \n
@@ -300,7 +300,8 @@ def create_patients_graph(self, f_patients: str):
                 classes_by_patient[patient_id] = set()
 
             # sort alleles for each HLA-X
-            for x in range(0, 10, 2):
+            l = len(geno)
+            for x in range(0, l, 2):
                 geno[x : x + 2] = sorted(geno[x : x + 2])
 
             geno = HashableArray(geno)
@@ -351,15 +352,14 @@ def find_geno_candidates_by_subclasses(self, subclasses):
                 genotypes_value,
             ) = self.__find_genotype_candidates_from_subclass(subclass.subclass)
 
+            geno = genotypes_value[0]
+            ALLELES_IN_CLASS_I = -2*int(-len(geno)/4-0.5)  # == 2 * ceil(len(geno) / 4)
+            ALLELES_IN_CLASS_II = len(geno) - ALLELES_IN_CLASS_I
             # Checks only the locuses that are not certain to match
             if subclass.class_num == 0:
-                allele_range_to_check = np.array(
-                    [6, 8, subclass.allele_num], dtype=np.uint8
-                )
+                allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)] + [subclass.allele_num], dtype=np.uint8)
             else:
-                allele_range_to_check = np.array(
-                    [0, 2, 4, subclass.allele_num], dtype=np.uint8
-                )
+                allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)] + [subclass.allele_num], dtype=np.uint8)
 
             # number of alleles that already match due to match in subclass
             matched_alleles: int = (
@@ -383,9 +383,9 @@ def find_geno_candidates_by_classes(self, classes):
             desc="finding classes matching candidates",
             disable=not self.verbose,
         ):
-            if self._graph.in_nodes(clss):
+            if self._graph.in_nodes(clss[0]):
                 patient_genos = self._patients_graph.neighbors(
-                    clss
+                    clss[0]
                 )  # The patient's genotypes which might be match
                 (
                     genotypes_ids,
@@ -395,12 +395,14 @@ def find_geno_candidates_by_classes(self, classes):
                 # Checks only the locuses that are not certain to match (the locuses of the other class)
                 # Class I appearances: 3 locuses = 6 alleles = 23/24 digits
                 # Class II appearances: 2 locuses = 4 alleles = 15/16 digits
-                if len(str(clss)) > 20:
-                    allele_range_to_check = np.array([6, 8], dtype=np.uint8)
-                    matched_alleles: int = 6
+                geno = list(self.patients.values())[0]  # any patient genotype; only its length is used
+                if clss[1] == 0:
+                    allele_range_to_check = np.array([x for x in range(len(geno)//2 + (len(geno)//2 & 1), len(geno), 2)], dtype=np.uint8)
+                    matched_alleles: int = len(geno)//2 + (len(geno)//2 & 1)
+
                 else:
-                    allele_range_to_check = np.array([0, 2, 4], dtype=np.uint8)
-                    matched_alleles: int = 4
+                    allele_range_to_check = np.array([x for x in range(0, len(geno)//2 + (len(geno)//2 & 1), 2)], dtype=np.uint8)
+                    matched_alleles: int = len(geno)//2 - (len(geno)//2 & 1)
 
                 # Compares the candidate to the patient's genotypes, and adds the match geno candidates to the graph.
                 self.__add_matched_genos_to_graph(
@@ -441,7 +443,7 @@ def find_geno_candidates_by_genotypes(self, patient_id: int):
             # and each patient connects only to their own genos, so we wouldn't override the weight dict.
             # self._patients_graph.add_edge(patient_id, geno_id, weight={geno_num: [probability, 10]})  # AMIT DELETE
             self._genotype_candidates[patient_id][geno_id] = {
-                geno_num: (probability, 10)
+                geno_num: (probability, len(geno))
             }  # AMIT ADD
             # else:
             #     print(f"Missing 'geno_num' for patient_id: {patient_id}")
@@ -507,7 +509,7 @@ def score_matches(
             ].items():  # AMIT ADD
                 for prob, matches in genotype_matches.values():  # AMIT CHANGE
                     # match_info = (probability of patient's genotype, number of matches to patient's genotype)
-                    if matches != 10 - mismatch:
+                    if matches != len(list(self.patients.values())[0]) - mismatch:
                         continue
 
                     # add the probabilities multiplication of the patient and all the donors that has this genotype
@@ -568,9 +570,10 @@ def __append_matching_donor(
         mm_number: int,
     ) -> None:
         """add a donor to the matches dictionary"""
-
-        compare_commons = locuses_match_between_genos(
-            self.patients[patient], self.get_most_common_genotype(donor)
+        pat = self.patients[patient]
+        don = self.get_most_common_genotype(donor)
+        compare_commons, gvh, hvg = locuses_match_between_genos(
+            pat, don
         )
 
         add_donors["Patient_ID"].append(patient)
@@ -578,28 +581,21 @@ def __append_matching_donor(
         allele_prob = self.probability_to_allele(
             don_id=donor, pat_geno=self.patients[patient]
         )
-        add_donors["Match_Probability_A_1"].append(allele_prob[0])
-        add_donors["Match_Probability_A_2"].append(allele_prob[1])
-        add_donors["Match_Probability_B_1"].append(allele_prob[2])
-        add_donors["Match_Probability_B_2"].append(allele_prob[3])
-        add_donors["Match_Probability_C_1"].append(allele_prob[4])
-        add_donors["Match_Probability_C_2"].append(allele_prob[5])
-        add_donors["Match_Probability_DQB1_1"].append(allele_prob[6])
-        add_donors["Match_Probability_DQB1_2"].append(allele_prob[7])
-        add_donors["Match_Probability_DRB1_1"].append(allele_prob[8])
-        add_donors["Match_Probability_DRB1_2"].append(allele_prob[9])
-
-        add_donors["Match_Between_Most_Commons_A"].append(compare_commons[0])
-        add_donors["Match_Between_Most_Commons_B"].append(compare_commons[1])
-        add_donors["Match_Between_Most_Commons_C"].append(compare_commons[2])
-        add_donors["Match_Between_Most_Commons_DQB"].append(compare_commons[3])
-        add_donors["Match_Between_Most_Commons_DRB"].append(compare_commons[4])
+        add_donors["Match_Probability"].append(allele_prob)
+        add_donors["Match_Between_Most_Commons"].append(compare_commons)
 
         add_donors["Matching_Probability"].append(match_prob)
-        add_donors["Number_Of_Mismatches"].append(mm_number)
-        add_donors["Permissive/Non-Permissive"].append(
-            "-"
-        )  # TODO: add permissiveness algorithm
+        actual_mismatches = 0
+        for match_score in compare_commons:
+            if match_score != 2:
+                actual_mismatches += (2 - match_score)
+
+        add_donors["Number_Of_Mismatches"].append(actual_mismatches)
+        add_donors["GvH_Mismatches"].append(gvh)
+        add_donors["HvG_Mismatches"].append(hvg)
+
+        add_donors["Permissive/Non-Permissive"].append("-")
+        # TODO: add permissiveness algorithm
 
         # add the other given fields to the results
         for field in donors_info:
diff --git a/grma/match/graph_wrapper.py b/grma/match/graph_wrapper.py
index c855ae5..6f1fe07 100644
--- a/grma/match/graph_wrapper.py
+++ b/grma/match/graph_wrapper.py
@@ -5,7 +5,6 @@
 from typing import Union
 
 import numpy as np
-
 from grma.utilities.geno_representation import HashableArray
 from grma.match.lol_graph import LolGraph
 
@@ -58,11 +57,10 @@ def get_edge_data(
         ret = self._graph.get_edge_data(node1_num, node2_num)
         return default if ret == exception_val else ret
 
-    def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False):
-        node_num = self._map_node_to_number[node] if not search_lol_id else node
+    def class_neighbors(self, node: NODES_TYPES | int, search_lol_id: bool = False, Len: int = 10):
+        node_num = self._map_node_to_number[node[0]] if not search_lol_id else node
         neighbors_list = self._graph.neighbors_unweighted(node_num)
-
-        neighbors_list_values = np.ndarray([len(neighbors_list), 10], dtype=np.uint16)
+        neighbors_list_values = np.ndarray([len(neighbors_list), Len], dtype=np.uint16)
         for i, neighbor in enumerate(neighbors_list):
             neighbors_list_values[i, :] = self._graph.arr_node_value_from_id(neighbor)
 
@@ -112,7 +110,7 @@ def neighbors(
     def neighbors_2nd(self, node):
         node_num = self._map_node_to_number[node]
         r0, r1 = self._graph.neighbors_2nd(node_num)
-        return r0[:-1], r1
+        return r0, r1
 
     def node_value_from_id(self, node_id: int) -> NODES_TYPES:
         """convert lol ID to node value"""
diff --git a/grma/match/lol_graph.pyx b/grma/match/lol_graph.pyx
index 44c111f..ee2679f 100644
--- a/grma/match/lol_graph.pyx
+++ b/grma/match/lol_graph.pyx
@@ -155,6 +155,7 @@ cdef class LolGraph:
         cdef UINT16[:] arr
         cdef np.ndarray[UINT16, ndim=2] neighbors_value
         cdef UINT num_of_neighbors_2nd
+        cdef UINT loci_len
 
         idx = self._index_list[node]
         idx_end = self._index_list[node + 1]
@@ -162,7 +163,6 @@ cdef class LolGraph:
         neighbors_list_id = np.zeros(idx_end - idx, dtype=np.uint32)
         for i in range(idx, idx_end):
             neighbors_list_id[i - idx] = self._neighbors_list[i]
-
         num_of_neighbors_2nd = self._weights_list[idx]
 
         neighbors_id = np.zeros(int(num_of_neighbors_2nd), dtype=np.uint32)
@@ -177,11 +177,12 @@ cdef class LolGraph:
                 neighbors_id[pointer] = self._neighbors_list[j]
                 pointer += 1
 
-        neighbors_value = np.zeros((num_of_neighbors_2nd - 1, 10), dtype=np.uint16)
-        for i in range(len(neighbors_id) - 1):
+        loci_len = self._map_number_to_arr_node.shape[1]
+        neighbors_value = np.zeros((num_of_neighbors_2nd, loci_len), dtype=np.uint16)
+        for i in range(len(neighbors_id)):
             neighbor_id = neighbors_id[i]
             arr = self.arr_node_value_from_id(neighbor_id)
-            for j in range(10):
+            for j in range(loci_len):
                 neighbors_value[i, j] = arr[j]
 
         return neighbors_id, neighbors_value
diff --git a/grma/match/match.py b/grma/match/match.py
index 302b71f..99a55ae 100644
--- a/grma/match/match.py
+++ b/grma/match/match.py
@@ -8,6 +8,8 @@
 import pandas as pd
 from grim import grim
 import csv
+import ast
+
 
 from grma.match import Graph as MatchingGraph
 from grma.match.donors_matching import DonorsMatching, _init_results_df
@@ -195,6 +197,9 @@ def find_matches(
     # the returned dictionary. {patient ID: pd.DataFrame(matches + features)}
     patients_results = {patient: None for patient in patients}
 
+    with open(imputation_filename, "r") as f:  # read the locus names from the GL string of the first imputed genotype
+        line = f.readline().strip()
+        loci = list(dict.fromkeys([item for item in line.split(',')[1].replace('*', ',').replace('+', ',').replace('^', ',').replace(':', ',').split(',') if not item.isdigit() and item]))
     if patients:
         avg_build_time = (end_build_graph - start_build_graph) / len(patients)
 
@@ -217,6 +222,23 @@ def find_matches(
             patient, g_m, donors_info, threshold, cutoff, classes, subclasses
         )
 
+
+        match_Probability = results_df["Match_Probability"]
+        match_Between_Most_Commons = results_df["Match_Between_Most_Commons"]
+        Permissive = results_df["Permissive/Non-Permissive"]
+        df_new = results_df.drop(columns=['Match_Probability', 'Match_Between_Most_Commons', 'Permissive/Non-Permissive']).copy()
+
+        for idx, row in enumerate(match_Probability):
+            k = 0
+            for locus in loci:
+                for i in [1, 2]:
+                    df_new.loc[idx, f"Match_Probability_{locus}_{i}"] = row[k]
+                    k += 1
+        df_new['Permissive/Non-Permissive'] = Permissive
+        for idx, row in enumerate(match_Between_Most_Commons):
+            for l, locus in enumerate(loci):
+                df_new.loc[idx, f"Match_Between_Most_Commons_{locus}"] = row[l]
+        results_df = df_new.copy()
 
         end = time.time()
         patient_time = end - start + avg_build_time
@@ -239,7 +261,6 @@ def find_matches(
                 f"Saved Matching results for {patient} in "
                 f"{os.path.join(f'{output_dir}/search_{search_id}', f'Patient_{patient}.csv')}"
             )
-
     return patients_results
 
 
diff --git a/grma/utilities/cutils.pyx b/grma/utilities/cutils.pyx
index b08b624..9072d3f 100644
--- a/grma/utilities/cutils.pyx
+++ b/grma/utilities/cutils.pyx
@@ -83,7 +83,7 @@ cpdef np.ndarray[INT8, ndim=1] ccheck_similarity(np.ndarray[UINT16, ndim=1] pati
             if counted - count_similar > 3:
                 similarities[i] = -1
                 break
-        if 10 - count_similar > 3:
+        if len(donors_geno) - count_similar > 3:
             similarities[i] = -1
         else:
             similarities[i] = count_similar
diff --git a/my_project_template/__init__.py b/my_project_template/__init__.py
index 9085028..fd2931e 100644
--- a/my_project_template/__init__.py
+++ b/my_project_template/__init__.py
@@ -26,4 +26,4 @@
 """Top-level package for My Project Template."""
 
 __organization__ = "NMDP/CIBMTR Bioinformatics"
-__version__ = "0.0.8"
+__version__ = "0.0.9"
diff --git a/pyproject.toml b/pyproject.toml
new file mode 100644
index 0000000..440f358
--- /dev/null
+++ b/pyproject.toml
@@ -0,0 +1,38 @@
+[build-system]
+requires = ["setuptools>=42", "wheel", "Cython>=0.29.32", "numpy>=1.20.2"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "py-graph-match-temp"
+version = "0.0.9"
+description = "A Python package for graph matching."
+readme = "README.md" +requires-python = ">=3.6" +license = { file = "LICENSE" } +authors = [ + { name = "Regev Yehezkel Imra", email = "regevel2006@gmail.com" } +] +dependencies = [ + "toml>=0.10.2", + "py-graph-imputation>=0.0.12", + "py-ard>=1.0.9", + "tqdm>=4.66.1", +] + +[project.optional-dependencies] +tests = [ + "pytest>=7.1.2", + "behave>=1.2.6", + "PyHamcrest>=2.0.3", +] +dev = [ + "allure-behave>=2.9.45", + "flake8>=4.0.1", + "bump2version>=1.0.1", + "coverage>=6.3.2", + "pre-commit>=2.18.1", +] +deploy = [ + "connexion[swagger-ui]>=2.13.0", + "gunicorn>=20.1.0", +] diff --git a/setup.cfg b/setup.cfg index 22a3b25..bdc047b 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 0.0.8 +current_version = 0.0.9 commit = True tag = True diff --git a/setup.py b/setup.py index 03e97ae..d33e6fa 100644 --- a/setup.py +++ b/setup.py @@ -52,8 +52,8 @@ def build_include_dirs(): test_requirements = requirements_file.read().split("\n") setup( - name="py-graph-match", - version="0.0.8", + name="py-graph-match-temp", + version="0.0.9", author="Pradeep Bashyal", author_email="pbashyal@nmdp.org", python_requires=">=3.8",