diff --git a/kg_microbe/transform_utils/constants.py b/kg_microbe/transform_utils/constants.py
index 7cdd98c2..53868615 100644
--- a/kg_microbe/transform_utils/constants.py
+++ b/kg_microbe/transform_utils/constants.py
@@ -596,6 +596,10 @@
 ]
 
 HGNC_OLD_PREFIX = "http://identifiers.org/hgnc/"
+HGNC_GENENAMES_PREFIX = "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:"
+HGNC_GENETYPE_PREFIX = "http://ontology.scibite.com/ontology/hgnc/SHGNC_"
+HGNC_GENEGROUP_PREFIX = "https://www.genenames.org/data/genegroup/#!/group/"
+HGNC_GENEPROPERTY_PREFIX = "http://ontology.scibite.com/property/"
 HGNC_NEW_PREFIX = "HGNC:"
 
 # Create a mapping for special cases
@@ -611,6 +615,11 @@
     UNIPATHWAYS_REACTION_PREFIX: re.sub(r"OBO:UPa_(\w{3})", r"UPA:\1", UNIPATHWAYS_REACTION_PREFIX),
     UNIPATHWAYS_PATHWAY_PREFIX: re.sub(r"OBO:UPa_(\w{3})", r"UPA:\1", UNIPATHWAYS_PATHWAY_PREFIX),
     HGNC_OLD_PREFIX: HGNC_NEW_PREFIX,
+    HGNC_GENENAMES_PREFIX: HGNC_NEW_PREFIX,
+    HGNC_GENETYPE_PREFIX: HGNC_NEW_PREFIX,
+    HGNC_GENEGROUP_PREFIX: HGNC_NEW_PREFIX,
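+    # e.g. a node id of HGNC_GENENAMES_PREFIX + "5" is rewritten to the
+    # CURIE "HGNC:5" (illustrative id)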
 }
 
 # CTD
@@ -620,7 +629,7 @@
 CTD_DISEASE_OMIM_COLUMN = "OmimIDs"
 CHEMICAL_TO_DISEASE_EDGE = "biolink:associated_with"
 MESH_PREFIX = "MESH:"
-NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes?curie="
+NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.5/get_normalized_nodes?curie="
 MONDO_PREFIX = "MONDO:"
 
 # Disbiome
diff --git a/kg_microbe/transform_utils/ontologies/ontologies_transform.py b/kg_microbe/transform_utils/ontologies/ontologies_transform.py
index d8ec7e8d..9190f166 100644
--- a/kg_microbe/transform_utils/ontologies/ontologies_transform.py
+++ b/kg_microbe/transform_utils/ontologies/ontologies_transform.py
@@ -21,6 +21,7 @@
     ENABLED_BY_RELATION,
     EXCLUSION_TERMS_FILE,
     GO_PREFIX,
+    HGNC_GENEPROPERTY_PREFIX,
     ID_COLUMN,
     MONDO_XREFS_FILEPATH,
     NCBITAXON_PREFIX,
@@ -67,15 +68,16 @@
 from ..transform import Transform
 
 ONTOLOGIES_MAP = {
     "ncbitaxon": "ncbitaxon.owl.gz",
     "chebi": "chebi.owl.gz",
     "envo": "envo.json",
     "go": "go.json",
     ## "rhea": "rhea.json.gz", # Redundant to RheaMappingsTransform
     "ec": "ec.json",
     "upa": "upa.owl",
     "mondo": "mondo.json",
     "hp": "hp.json",
+    "hgnc": "hgnc.owl",
 }
 
 
@@ -175,7 +177,7 @@ def parse(self, name: str, data_file: Optional[Path], source: str) -> None:
             output=self.output_dir / name,
             output_format="tsv",
         )
-        if name in ["ec", "upa", "chebi", "mondo"]:  # removed "uniprot", "rhea"
+        if name in ["ec", "upa", "chebi", "mondo", "hgnc"]:  # removed "uniprot", "rhea"
             self.post_process(name)
 
 
@@ -431,7 +433,7 @@ def _replace_quotation_marks(line, description_index):
                 for line in new_edge_lines:
                     new_ef.write(line)
 
-        if name == "ec":  # or name == "rhea":
+        if name == "ec" or name == "hgnc":  # or name == "rhea":
             with open(nodes_file, "r") as nf, open(edges_file, "r") as ef:
                 # Update prefixes in nodes file
                 new_nf_lines = []
@@ -449,15 +451,17 @@ def _replace_quotation_marks(line, description_index):
                 # Update prefixes in edges file
                 new_ef_lines = []
                 for line in ef:
-                    if line.startswith("id"):
-                        continue
-                    else:
+                    if not line.startswith("id"):
                         line = _replace_special_prefixes(line)
-                        new_ef_lines.append(line)
+                    new_ef_lines.append(line)
            if name == "ec":
                 # Remove Uniprot nodes since accounted for elsewhere
                 new_nf_lines = [line for line in new_nf_lines if UNIPROT_PREFIX not in line]
                 new_ef_lines = [line for line in new_ef_lines if UNIPROT_PREFIX not in line]
+            elif name == "hgnc":
+                # Remove Property nodes
+                new_nf_lines = [line for line in new_nf_lines if HGNC_GENEPROPERTY_PREFIX not in line]
+                new_ef_lines = [line for line in new_ef_lines if HGNC_GENEPROPERTY_PREFIX not in line]
             # elif name == "rhea":
             #     # Remove debio nodes that account for direction, since already there in inverse triples
             #     # Note that CHEBI and EC predicates do not match Rhea pyobo, so removing them
@@ -475,7 +479,6 @@ def _replace_quotation_marks(line, description_index):
 
             # Rewrite edges file
             with open(edges_file, "w") as new_ef:
-                new_ef.write("\t".join(self.edge_header) + "\n")
                 for line in new_ef_lines:
                     new_ef.write(line)
diff --git a/kg_microbe/transform_utils/uniprot_human/uniprot_human.py b/kg_microbe/transform_utils/uniprot_human/uniprot_human.py
index fd7aba03..243f53a2 100644
--- a/kg_microbe/transform_utils/uniprot_human/uniprot_human.py
+++ b/kg_microbe/transform_utils/uniprot_human/uniprot_human.py
@@ -60,7 +60,7 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
         os.makedirs(UNIPROT_HUMAN_TMP_DIR, exist_ok=True)
         os.makedirs(UNIPROT_HUMAN_TMP_NE_DIR, exist_ok=True)
         go_category_trees_dict = prepare_go_dictionary()
-        mondo_xrefs_dict, mondo_gene_dict = prepare_mondo_dictionary()
+        mondo_xrefs_dict = prepare_mondo_dictionary()
 
         # make directory in data/transformed
         os.makedirs(self.output_dir, exist_ok=True)
@@ -82,7 +82,6 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
             self.output_edge_file,
             go_category_trees_dict,
             mondo_xrefs_dict,
-            mondo_gene_dict,
             OBSOLETE_TERMS_CSV_FILE,
             UNIPROT_HUMAN_RELEVANT_FILE_LIST,
             UNIPROT_HUMAN_TMP_NE_DIR,
diff --git a/kg_microbe/utils/uniprot_utils.py b/kg_microbe/utils/uniprot_utils.py
index 26619af1..366484f4 100644
--- a/kg_microbe/utils/uniprot_utils.py
+++ b/kg_microbe/utils/uniprot_utils.py
@@ -9,6 +9,7 @@
 from pathlib import Path
 
 import pandas as pd
+import requests
 from tqdm import tqdm
 
 from kg_microbe.transform_utils.constants import (
@@ -20,6 +21,7 @@
     EC_CATEGORY,
     EC_PREFIX,
     ENABLES,
+    GENE_CATEGORY,
     GENE_TO_PROTEIN_EDGE,
     GO_BIOLOGICAL_PROCESS_ID,
     GO_BIOLOGICAL_PROCESS_LABEL,
@@ -37,6 +39,7 @@
     MONDO_XREFS_FILEPATH,
     NCBI_CATEGORY,
     NCBITAXON_PREFIX,
+    NODE_NORMALIZER_URL,
     OMIM_PREFIX,
     ONTOLOGIES_TREES_DIR,
     PARTICIPATES_IN,
@@ -98,6 +101,7 @@
 RHEA_PARSED_COLUMN = "rhea_parsed"
 DISEASE_PARSED_COLUMN = "disease_parsed"
 GENE_PRIMARY_PARSED_COLUMN = "gene_primary_parsed"
+GENE_NAME_PRIMARY_PARSED_COLUMN = "gene_name_primary_parsed"
 GO_TERM_COLUMN = "GO_Term"
 GO_CATEGORY_COLUMN = "GO_Category"
 UNIPROT_ID_COLUMN = "Uniprot_ID"
@@ -106,6 +110,32 @@
 CHEBI_REGEX = re.compile(r'/ligand_id="ChEBI:(.*?)";')
 GO_REGEX = re.compile(r"\[(.*?)\]")
 
+
+def normalize_node_api(node_curie):
+    """Return the HGNC-equivalent CURIE for a CURIE of the form PREFIX:ID, or None."""
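+    # Illustrative only (assumes the service applies gene-protein conflation):
+    # for "UniProtKB:P04637" the payload's "equivalent_identifiers" list would
+    # contain {"identifier": "HGNC:11998", ...}, so "HGNC:11998" is returned.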
re.compile(r"\[(.*?)\]") +# Takes cure in the form PREFIX:ID +def normalize_node_api(node_curie): + + url = NODE_NORMALIZER_URL + node_curie + + # Make the HTTP request to NodeNormalizer + response = requests.get(url, timeout=30) + response.raise_for_status() + + # Write response to file if it contains data + entries = response.json()[node_curie] + try: + if len(entries) > 1: # .strip().split("\n") + for iden in entries["equivalent_identifiers"]: + if iden["identifier"].split(":")[0] + ":" == HGNC_NEW_PREFIX: + norm_node = iden["identifier"] + return norm_node + # Handle case where node normalizer returns nothing + except TypeError: + return None + + else: + return None def is_float(entry): """Determine if value is float, returns True/False.""" @@ -213,7 +240,7 @@ def parse_disease(disease_entry, mondo_xref_dict): return mondo_list -def parse_gene(gene_entry, mondo_gene_dict): +def parse_gene(gene_entry, protein_entry): """ Get gene ID from gene name entry. @@ -226,8 +253,10 @@ def parse_gene(gene_entry, mondo_gene_dict): """ gene_id = None if not is_float(gene_entry): - gene_name = gene_entry - gene_id = next((key for key, val in mondo_gene_dict.items() if val == gene_name), None) + # gene_name = gene_entry + # gene_id = next((key for key, val in mondo_gene_dict.items() if val == gene_name), None) + # if not gene_id: + gene_id = normalize_node_api(protein_entry) return gene_id @@ -289,7 +318,6 @@ def get_nodes_and_edges( uniprot_df, go_category_trees_dictionary, mondo_xrefs_dict, - mondo_gene_dict, obsolete_terms_csv_file, ): """ @@ -321,6 +349,7 @@ def get_nodes_and_edges( parsed_columns += [ DISEASE_PARSED_COLUMN, GENE_PRIMARY_PARSED_COLUMN, + GENE_NAME_PRIMARY_PARSED_COLUMN ] uniprot_parse_df = pd.DataFrame(columns=parsed_columns) uniprot_parse_df[ORGANISM_PARSED_COLUMN] = uniprot_df[UNIPROT_ORG_ID_COLUMN_NAME].apply( @@ -349,10 +378,20 @@ def get_nodes_and_edges( uniprot_parse_df[DISEASE_PARSED_COLUMN] = uniprot_df[UNIPROT_DISEASE_COLUMN_NAME].apply( lambda x: parse_disease(x, mondo_xrefs_dict) ) + # if UNIPROT_GENE_PRIMARY_COLUMN_NAME in uniprot_df.columns: + # uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df[ + # UNIPROT_GENE_PRIMARY_COLUMN_NAME + # ].apply(lambda x: parse_gene(x, mondo_gene_dict)) if UNIPROT_GENE_PRIMARY_COLUMN_NAME in uniprot_df.columns: - uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df[ - UNIPROT_GENE_PRIMARY_COLUMN_NAME - ].apply(lambda x: parse_gene(x, mondo_gene_dict)) + uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df.apply( + lambda row: parse_gene( + row[UNIPROT_GENE_PRIMARY_COLUMN_NAME], + UNIPROT_PREFIX + str(row[UNIPROT_PROTEIN_ID_COLUMN_NAME] + ) + if pd.notna(row[UNIPROT_PROTEIN_ID_COLUMN_NAME]) + else ""), + axis=1 + ) for _, entry in uniprot_parse_df.iterrows(): # Organism node @@ -502,28 +541,28 @@ def prepare_mondo_dictionary(): for row in csv_reader: if OMIM_PREFIX in row["xref"]: mondo_xrefs_dict[row["id"]] = row["xref"] - # Read MONDO nodes file for gene names - mondo_gene_dict = {} - if MONDO_GENE_IDS_FILEPATH.exists(): - with open(MONDO_GENE_IDS_FILEPATH, "r") as file: - csv_reader = csv.DictReader(file, delimiter="\t") - for row in csv_reader: - mondo_gene_dict[row["id"]] = row["name"] - #! 
-            uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df[
-                UNIPROT_GENE_PRIMARY_COLUMN_NAME
-            ].apply(lambda x: parse_gene(x, mondo_gene_dict))
+            uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df.apply(
+                lambda row: parse_gene(
+                    row[UNIPROT_GENE_PRIMARY_COLUMN_NAME],
+                    UNIPROT_PREFIX + str(row[UNIPROT_PROTEIN_ID_COLUMN_NAME])
+                    if pd.notna(row[UNIPROT_PROTEIN_ID_COLUMN_NAME])
+                    else "",
+                ),
+                axis=1,
+            )
 
         for _, entry in uniprot_parse_df.iterrows():
             # Organism node
@@ -502,28 +540,7 @@ def prepare_mondo_dictionary():
             for row in csv_reader:
                 if OMIM_PREFIX in row["xref"]:
                     mondo_xrefs_dict[row["id"]] = row["xref"]
-    # Read MONDO nodes file for gene names
-    mondo_gene_dict = {}
-    if MONDO_GENE_IDS_FILEPATH.exists():
-        with open(MONDO_GENE_IDS_FILEPATH, "r") as file:
-            csv_reader = csv.DictReader(file, delimiter="\t")
-            for row in csv_reader:
-                mondo_gene_dict[row["id"]] = row["name"]
-    #! TODO: use oak
-    else:
-        mondo_nodes_file = (
-            Path(__file__).parents[2] / "data" / "transformed" / "ontologies" / "mondo_nodes.tsv"
-        )
-        if mondo_nodes_file.exists():
-            with open(mondo_nodes_file, "r") as file:
-                csv_reader = csv.DictReader(file, delimiter="\t")
-                for row in csv_reader:
-                    if HGNC_NEW_PREFIX in row["id"]:
-                        mondo_gene_dict[row["id"]] = row["name"]
-            mondo_gene_df = pd.DataFrame(list(mondo_gene_dict.items()), columns=["id", "name"])
-            mondo_gene_df.to_csv(MONDO_GENE_IDS_FILEPATH, sep="\t", index=False)
-    return mondo_xrefs_dict, mondo_gene_dict
+    return mondo_xrefs_dict
 
 
 def process_lines(
@@ -537,7 +554,6 @@
     progress_class,
     go_category_dictionary,
     mondo_xrefs_dict,
-    mondo_gene_dict,
     obsolete_terms_csv_file,
 ):
     """
@@ -566,7 +582,6 @@
             df,
             go_category_dictionary,
             mondo_xrefs_dict,
-            mondo_gene_dict,
             obsolete_terms_csv_file,
         )
         # Write node and edge data to unique files
@@ -694,7 +709,6 @@
     output_edge_file,
     go_category_trees_dict,
     mondo_xrefs_dict,
-    mondo_gene_dict,
     obsolete_terms_csv_file,
     uniprot_relevant_file_list,
     uniprot_tmp_ne_dir,
@@ -726,7 +740,6 @@
             progress_class,
             go_category_trees_dict,
             mondo_xrefs_dict,
-            mondo_gene_dict,
             obsolete_terms_csv_file,
         )
         for line_chunk in line_chunks