9 changes: 8 additions & 1 deletion kg_microbe/transform_utils/constants.py
@@ -596,6 +596,10 @@
]

HGNC_OLD_PREFIX = "http://identifiers.org/hgnc/"
HGNC_GENENAMES_PREFIX = "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:"
HGNC_GENETYPE_PREFIX = "http://ontology.scibite.com/ontology/hgnc/SHGNC_"
HGNC_GENEGROUP_PREFIX = "https://www.genenames.org/data/genegroup/#!/group/"
HGNC_GENEPROPERTY_PREFIX = "http://ontology.scibite.com/property/"
HGNC_NEW_PREFIX = "HGNC:"

# Create a mapping for special cases
@@ -611,6 +615,9 @@
UNIPATHWAYS_REACTION_PREFIX: re.sub(r"OBO:UPa_(\w{3})", r"UPA:\1", UNIPATHWAYS_REACTION_PREFIX),
UNIPATHWAYS_PATHWAY_PREFIX: re.sub(r"OBO:UPa_(\w{3})", r"UPA:\1", UNIPATHWAYS_PATHWAY_PREFIX),
HGNC_OLD_PREFIX: HGNC_NEW_PREFIX,
HGNC_GENENAMES_PREFIX: HGNC_NEW_PREFIX,
HGNC_GENETYPE_PREFIX: HGNC_NEW_PREFIX,
HGNC_GENEGROUP_PREFIX: HGNC_NEW_PREFIX,
}

# CTD
@@ -620,7 +627,7 @@
CTD_DISEASE_OMIM_COLUMN = "OmimIDs"
CHEMICAL_TO_DISEASE_EDGE = "biolink:associated_with"
MESH_PREFIX = "MESH:"
NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes?curie="
NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.5/get_normalized_nodes?curie="
MONDO_PREFIX = "MONDO:"

# Disbiome
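For context, the special-case mapping extended above feeds the prefix rewriting applied during ontology post-processing (the _replace_special_prefixes helper referenced in ontologies_transform.py below; its body is not part of this diff). A minimal sketch of that kind of line-level replacement, assuming the mapping is a plain dict of long-form prefix to CURIE prefix; the function shown here is illustrative, not the repository's actual implementation:

# Illustrative sketch only; the real _replace_special_prefixes is not shown in this diff.
SPECIAL_PREFIXES = {
    "http://identifiers.org/hgnc/": "HGNC:",
    "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:": "HGNC:",
    "http://ontology.scibite.com/ontology/hgnc/SHGNC_": "HGNC:",
    "https://www.genenames.org/data/genegroup/#!/group/": "HGNC:",
}

def replace_special_prefixes(line: str) -> str:
    """Rewrite any known long-form prefix in a TSV line to its CURIE form."""
    for old_prefix, new_prefix in SPECIAL_PREFIXES.items():
        line = line.replace(old_prefix, new_prefix)
    return line

Because the genenames.org gene-symbol-report URL already ends in "HGNC:", replacing it with the CURIE prefix leaves exactly one "HGNC:" in front of the numeric ID.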
34 changes: 19 additions & 15 deletions kg_microbe/transform_utils/ontologies/ontologies_transform.py
@@ -21,6 +21,7 @@
ENABLED_BY_RELATION,
EXCLUSION_TERMS_FILE,
GO_PREFIX,
HGNC_GENEPROPERTY_PREFIX,
ID_COLUMN,
MONDO_XREFS_FILEPATH,
NCBITAXON_PREFIX,
@@ -67,15 +68,16 @@
from ..transform import Transform

ONTOLOGIES_MAP = {
"ncbitaxon": "ncbitaxon.owl.gz",
"chebi": "chebi.owl.gz",
"envo": "envo.json",
"go": "go.json",
## "rhea": "rhea.json.gz", # Redundant to RheaMappingsTransform
# "ncbitaxon": "ncbitaxon.owl.gz",
# "chebi": "chebi.owl.gz",
# "envo": "envo.json",
# "go": "go.json",
# ## "rhea": "rhea.json.gz", # Redundant to RheaMappingsTransform
"ec": "ec.json",
"upa": "upa.owl",
"mondo": "mondo.json",
"hp": "hp.json",
# "upa": "upa.owl",
# "mondo": "mondo.json",
# "hp": "hp.json",
"hgnc": "hgnc.owl"
}


@@ -175,7 +177,7 @@ def parse(self, name: str, data_file: Optional[Path], source: str) -> None:
output=self.output_dir / name,
output_format="tsv",
)
if name in ["ec", "upa", "chebi", "mondo"]: # removed "uniprot", "rhea"
if name in ["ec", "upa", "chebi", "mondo", "hgnc"]: # removed "uniprot", "rhea"

self.post_process(name)

@@ -431,7 +433,7 @@ def _replace_quotation_marks(line, description_index):
for line in new_edge_lines:
new_ef.write(line)

if name == "ec": # or name == "rhea":
if name == "ec" or name == "hgnc": # or name == "rhea":
with open(nodes_file, "r") as nf, open(edges_file, "r") as ef:
# Update prefixes in nodes file
new_nf_lines = []
@@ -449,15 +451,17 @@
# Update prefixes in edges file
new_ef_lines = []
for line in ef:
if line.startswith("id"):
continue
else:
if not line.startswith("id"):
line = _replace_special_prefixes(line)
new_ef_lines.append(line)
new_ef_lines.append(line)
if name == "ec":
# Remove Uniprot nodes since accounted for elsewhere
new_nf_lines = [line for line in new_nf_lines if UNIPROT_PREFIX not in line]
new_ef_lines = [line for line in new_ef_lines if UNIPROT_PREFIX not in line]
elif name == "hgnc":
# Remove Property nodes
new_nf_lines = [line for line in new_nf_lines if HGNC_GENEPROPERTY_PREFIX not in line]
new_ef_lines = [line for line in new_ef_lines if HGNC_GENEPROPERTY_PREFIX not in line]
# elif name == "rhea":
# # Remove debio nodes that account for direction, since already there in inverse triples
# # Note that CHEBI and EC predicates do not match Rhea pyobo, so removing them
@@ -475,7 +479,7 @@

# Rewrite edges file
with open(edges_file, "w") as new_ef:
new_ef.write("\t".join(self.edge_header) + "\n")
# new_ef.write("\t".join(self.edge_header) + "\n")
Copilot AI Aug 13, 2025

Commented-out code should be removed rather than left in the codebase. If this header writing logic is needed, it should be uncommented and used; otherwise it should be deleted.

Suggested change
# new_ef.write("\t".join(self.edge_header) + "\n")

for line in new_ef_lines:
new_ef.write(line)

3 changes: 1 addition & 2 deletions kg_microbe/transform_utils/uniprot_human/uniprot_human.py
@@ -60,7 +60,7 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
os.makedirs(UNIPROT_HUMAN_TMP_DIR, exist_ok=True)
os.makedirs(UNIPROT_HUMAN_TMP_NE_DIR, exist_ok=True)
go_category_trees_dict = prepare_go_dictionary()
mondo_xrefs_dict, mondo_gene_dict = prepare_mondo_dictionary()
mondo_xrefs_dict = prepare_mondo_dictionary()

# make directory in data/transformed
os.makedirs(self.output_dir, exist_ok=True)
@@ -82,7 +82,6 @@
self.output_edge_file,
go_category_trees_dict,
mondo_xrefs_dict,
mondo_gene_dict,
OBSOLETE_TERMS_CSV_FILE,
UNIPROT_HUMAN_RELEVANT_FILE_LIST,
UNIPROT_HUMAN_TMP_NE_DIR,
2 changes: 1 addition & 1 deletion kg_microbe/transform_utils/wallen_etal/wallen_etal.py
@@ -61,7 +61,7 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu

wallen_etal_df = pd.read_excel(input_file, skiprows=3, sheet_name=WALLEN_ETAL_TAB_NAME)
wallen_etal_df[FDR_COLUMN] = pd.to_numeric(wallen_etal_df[FDR_COLUMN], errors="coerce")

import pdb;pdb.set_trace()
Copilot AI Aug 13, 2025

Debug statement with pdb.set_trace() should not be committed to production code. This will cause the program to halt execution and wait for debugger input.

Suggested change
import pdb;pdb.set_trace()

significant_wallenetal_df = wallen_etal_df[
wallen_etal_df[FDR_COLUMN].apply(lambda x: isinstance(x, float))
]
99 changes: 67 additions & 32 deletions kg_microbe/utils/uniprot_utils.py
@@ -9,6 +9,7 @@
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm

from kg_microbe.transform_utils.constants import (
@@ -20,6 +21,7 @@
EC_CATEGORY,
EC_PREFIX,
ENABLES,
GENE_CATEGORY,
GENE_TO_PROTEIN_EDGE,
GO_BIOLOGICAL_PROCESS_ID,
GO_BIOLOGICAL_PROCESS_LABEL,
@@ -37,6 +39,7 @@
MONDO_XREFS_FILEPATH,
NCBI_CATEGORY,
NCBITAXON_PREFIX,
NODE_NORMALIZER_URL,
OMIM_PREFIX,
ONTOLOGIES_TREES_DIR,
PARTICIPATES_IN,
@@ -98,6 +101,7 @@
RHEA_PARSED_COLUMN = "rhea_parsed"
DISEASE_PARSED_COLUMN = "disease_parsed"
GENE_PRIMARY_PARSED_COLUMN = "gene_primary_parsed"
GENE_NAME_PRIMARY_PARSED_COLUMN = "gene_name_primary_parsed"
GO_TERM_COLUMN = "GO_Term"
GO_CATEGORY_COLUMN = "GO_Category"
UNIPROT_ID_COLUMN = "Uniprot_ID"
@@ -106,6 +110,29 @@
CHEBI_REGEX = re.compile(r'/ligand_id="ChEBI:(.*?)";')
GO_REGEX = re.compile(r"\[(.*?)\]")

# Takes cure in the form PREFIX:ID
Copilot AI Aug 13, 2025

Typo in comment: 'cure' should be 'curie'.

Suggested change
# Takes cure in the form PREFIX:ID
# Takes curie in the form PREFIX:ID

def normalize_node_api(node_curie):

url = NODE_NORMALIZER_URL + node_curie

# Make the HTTP request to NodeNormalizer
response = requests.get(url, timeout=30)
response.raise_for_status()

# Write response to file if it contains data
entries = response.json()[node_curie]
Copilot AI Aug 13, 2025

This line assumes the response JSON contains the node_curie key, but this could fail if the API response structure is different or if the key doesn't exist, potentially causing a KeyError.

Suggested change
entries = response.json()[node_curie]
entries = response.json().get(node_curie)
if entries is None:
return None

try:
if len(entries) > 1: # .strip().split("\n")
Copilot AI Aug 13, 2025

The condition len(entries) > 1 doesn't make logical sense here. The code is checking if entries has more than one element, but then accesses entries["equivalent_identifiers"], suggesting entries is a dictionary. The logic should verify entries is not None and contains the expected structure.

Suggested change
if len(entries) > 1: # .strip().split("\n")
if entries and "equivalent_identifiers" in entries and entries["equivalent_identifiers"]:

for iden in entries["equivalent_identifiers"]:
if iden["identifier"].split(":")[0] + ":" == HGNC_NEW_PREFIX:
norm_node = iden["identifier"]
return norm_node
# Handle case where node normalizer returns nothing
except TypeError:
return None

else:
Copilot AI Aug 13, 2025

The function lacks proper error handling for HTTP requests. If the API is unavailable or returns an error status, the function will raise an exception without providing a meaningful error message to help with debugging.

Suggested change
else:
url = NODE_NORMALIZER_URL + node_curie
try:
# Make the HTTP request to NodeNormalizer
response = requests.get(url, timeout=30)
response.raise_for_status()
# Write response to file if it contains data
entries = response.json()[node_curie]
if len(entries) > 1: # .strip().split("\n")
for iden in entries["equivalent_identifiers"]:
if iden["identifier"].split(":")[0] + ":" == HGNC_NEW_PREFIX:
norm_node = iden["identifier"]
return norm_node
else:
return None
except requests.exceptions.RequestException as e:
logging.error(f"HTTP request failed for node_curie '{node_curie}' at URL '{url}': {e}")
return None
except (ValueError, KeyError, TypeError) as e:
logging.error(f"Error processing response for node_curie '{node_curie}' at URL '{url}': {e}")

return None

def is_float(entry):
"""Determine if value is float, returns True/False."""
@@ -213,7 +240,7 @@ def parse_disease(disease_entry, mondo_xref_dict):
return mondo_list


def parse_gene(gene_entry, mondo_gene_dict):
def parse_gene(gene_entry, protein_entry):
"""
Get gene ID from gene name entry.

@@ -226,8 +253,10 @@ def parse_gene(gene_entry, mondo_gene_dict):
"""
gene_id = None
if not is_float(gene_entry):
gene_name = gene_entry
gene_id = next((key for key, val in mondo_gene_dict.items() if val == gene_name), None)
# gene_name = gene_entry
# gene_id = next((key for key, val in mondo_gene_dict.items() if val == gene_name), None)
# if not gene_id:
gene_id = normalize_node_api(protein_entry)

return gene_id

@@ -289,7 +318,6 @@ def get_nodes_and_edges(
uniprot_df,
go_category_trees_dictionary,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
):
"""
@@ -321,6 +349,7 @@
parsed_columns += [
DISEASE_PARSED_COLUMN,
GENE_PRIMARY_PARSED_COLUMN,
GENE_NAME_PRIMARY_PARSED_COLUMN
]
uniprot_parse_df = pd.DataFrame(columns=parsed_columns)
uniprot_parse_df[ORGANISM_PARSED_COLUMN] = uniprot_df[UNIPROT_ORG_ID_COLUMN_NAME].apply(
@@ -349,10 +378,20 @@
uniprot_parse_df[DISEASE_PARSED_COLUMN] = uniprot_df[UNIPROT_DISEASE_COLUMN_NAME].apply(
lambda x: parse_disease(x, mondo_xrefs_dict)
)
# if UNIPROT_GENE_PRIMARY_COLUMN_NAME in uniprot_df.columns:
# uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df[
# UNIPROT_GENE_PRIMARY_COLUMN_NAME
# ].apply(lambda x: parse_gene(x, mondo_gene_dict))
if UNIPROT_GENE_PRIMARY_COLUMN_NAME in uniprot_df.columns:
uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df[
UNIPROT_GENE_PRIMARY_COLUMN_NAME
].apply(lambda x: parse_gene(x, mondo_gene_dict))
uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df.apply(
lambda row: parse_gene(
row[UNIPROT_GENE_PRIMARY_COLUMN_NAME],
UNIPROT_PREFIX + str(row[UNIPROT_PROTEIN_ID_COLUMN_NAME]
)
if pd.notna(row[UNIPROT_PROTEIN_ID_COLUMN_NAME])
else ""),
axis=1
)

for _, entry in uniprot_parse_df.iterrows():
# Organism node
@@ -502,28 +541,28 @@ def prepare_mondo_dictionary():
for row in csv_reader:
if OMIM_PREFIX in row["xref"]:
mondo_xrefs_dict[row["id"]] = row["xref"]
# Read MONDO nodes file for gene names
mondo_gene_dict = {}
if MONDO_GENE_IDS_FILEPATH.exists():
with open(MONDO_GENE_IDS_FILEPATH, "r") as file:
csv_reader = csv.DictReader(file, delimiter="\t")
for row in csv_reader:
mondo_gene_dict[row["id"]] = row["name"]
#! TODO: use oak
else:
mondo_nodes_file = (
Path(__file__).parents[2] / "data" / "transformed" / "ontologies" / "mondo_nodes.tsv"
)
if mondo_nodes_file.exists():
with open(mondo_nodes_file, "r") as file:
csv_reader = csv.DictReader(file, delimiter="\t")
for row in csv_reader:
if HGNC_NEW_PREFIX in row["id"]:
mondo_gene_dict[row["id"]] = row["name"]
mondo_gene_df = pd.DataFrame(list(mondo_gene_dict.items()), columns=["id", "name"])
mondo_gene_df.to_csv(MONDO_GENE_IDS_FILEPATH, sep="\t", index=False)

return mondo_xrefs_dict, mondo_gene_dict
return mondo_xrefs_dict
# # Read MONDO nodes file for gene names
# mondo_gene_dict = {}
# if MONDO_GENE_IDS_FILEPATH.exists():
# with open(MONDO_GENE_IDS_FILEPATH, "r") as file:
# csv_reader = csv.DictReader(file, delimiter="\t")
# for row in csv_reader:
# mondo_gene_dict[row["id"]] = row["name"]
# #! TODO: use oak
# else:
# mondo_nodes_file = (
# Path(__file__).parents[2] / "data" / "transformed" / "ontologies" / "mondo_nodes.tsv"
# )
# if mondo_nodes_file.exists():
# with open(mondo_nodes_file, "r") as file:
# csv_reader = csv.DictReader(file, delimiter="\t")
# for row in csv_reader:
# if HGNC_NEW_PREFIX in row["id"]:
# mondo_gene_dict[row["id"]] = row["name"]
# mondo_gene_df = pd.DataFrame(list(mondo_gene_dict.items()), columns=["id", "name"])
# mondo_gene_df.to_csv(MONDO_GENE_IDS_FILEPATH, sep="\t", index=False)


def process_lines(
@@ -537,7 +576,6 @@
progress_class,
go_category_dictionary,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
):
"""
@@ -566,7 +604,6 @@
df,
go_category_dictionary,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
)
# Write node and edge data to unique files
@@ -694,7 +731,6 @@ def create_pool(
output_edge_file,
go_category_trees_dict,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
uniprot_relevant_file_list,
uniprot_tmp_ne_dir,
@@ -726,7 +762,6 @@
progress_class,
go_category_trees_dict,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
)
for line_chunk in line_chunks
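As a usage note on the new normalize_node_api helper: parse_gene now passes it a UniProtKB CURIE (UNIPROT_PREFIX plus the protein accession) and expects an HGNC CURIE back from the Node Normalizer's equivalent_identifiers list. A rough sketch of the call pattern and the response shape it relies on; the helper name, the payload fields, and the TP53 example below are assumptions for illustration, not something this diff asserts:

import requests

NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.5/get_normalized_nodes?curie="
HGNC_NEW_PREFIX = "HGNC:"

def normalize_to_hgnc(node_curie: str):
    """Return the first HGNC-prefixed equivalent identifier for node_curie, or None."""
    response = requests.get(NODE_NORMALIZER_URL + node_curie, timeout=30)
    response.raise_for_status()
    # Assumed shape: {"<curie>": {"id": {...}, "equivalent_identifiers": [{"identifier": "HGNC:...", ...}, ...]}}
    entry = response.json().get(node_curie)
    if not entry:
        return None
    for iden in entry.get("equivalent_identifiers", []):
        if iden["identifier"].startswith(HGNC_NEW_PREFIX):
            return iden["identifier"]
    return None

# Example, assuming UNIPROT_PREFIX resolves to "UniProtKB:" and the service conflates proteins to genes:
# normalize_to_hgnc("UniProtKB:P04637") would be expected to return an HGNC CURIE for TP53 (e.g. "HGNC:11998").

Note that, as written in the diff, normalize_node_api issues one HTTP request per gene row, so caching or batching CURIEs may be worth considering if this becomes a bottleneck.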