9 changes: 8 additions & 1 deletion kg_microbe/transform_utils/constants.py
@@ -596,6 +596,10 @@
]

HGNC_OLD_PREFIX = "http://identifiers.org/hgnc/"
HGNC_GENENAMES_PREFIX = "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:"
HGNC_GENETYPE_PREFIX = "http://ontology.scibite.com/ontology/hgnc/SHGNC_"
HGNC_GENEGROUP_PREFIX = "https://www.genenames.org/data/genegroup/#!/group/"
HGNC_GENEPROPERTY_PREFIX = "http://ontology.scibite.com/property/"
HGNC_NEW_PREFIX = "HGNC:"

# Create a mapping for special cases
@@ -611,6 +615,9 @@
UNIPATHWAYS_REACTION_PREFIX: re.sub(r"OBO:UPa_(\w{3})", r"UPA:\1", UNIPATHWAYS_REACTION_PREFIX),
UNIPATHWAYS_PATHWAY_PREFIX: re.sub(r"OBO:UPa_(\w{3})", r"UPA:\1", UNIPATHWAYS_PATHWAY_PREFIX),
HGNC_OLD_PREFIX: HGNC_NEW_PREFIX,
HGNC_GENENAMES_PREFIX: HGNC_NEW_PREFIX,
HGNC_GENETYPE_PREFIX: HGNC_NEW_PREFIX,
HGNC_GENEGROUP_PREFIX: HGNC_NEW_PREFIX,
}

# CTD
@@ -620,7 +627,7 @@
CTD_DISEASE_OMIM_COLUMN = "OmimIDs"
CHEMICAL_TO_DISEASE_EDGE = "biolink:associated_with"
MESH_PREFIX = "MESH:"
NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.4/get_normalized_nodes?curie="
NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.5/get_normalized_nodes?curie="
MONDO_PREFIX = "MONDO:"

# Disbiome
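For context, the special-case mapping extended above feeds the prefix rewriting applied during ontology post-processing (the _replace_special_prefixes helper referenced in ontologies_transform.py below; its body is not part of this diff). A minimal sketch of that kind of line-level replacement, assuming the mapping is a plain dict of long-form prefix to CURIE prefix; the function shown here is illustrative, not the repository's actual implementation:

# Illustrative sketch only; the real _replace_special_prefixes is not shown in this diff.
SPECIAL_PREFIXES = {
    "http://identifiers.org/hgnc/": "HGNC:",
    "https://www.genenames.org/data/gene-symbol-report/#!/hgnc_id/HGNC:": "HGNC:",
    "http://ontology.scibite.com/ontology/hgnc/SHGNC_": "HGNC:",
    "https://www.genenames.org/data/genegroup/#!/group/": "HGNC:",
}

def replace_special_prefixes(line: str) -> str:
    """Rewrite any known long-form prefix in a TSV line to its CURIE form."""
    for old_prefix, new_prefix in SPECIAL_PREFIXES.items():
        line = line.replace(old_prefix, new_prefix)
    return line

Because the genenames.org gene-symbol-report URL already ends in "HGNC:", replacing it with the CURIE prefix leaves exactly one "HGNC:" in front of the numeric ID.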
34 changes: 19 additions & 15 deletions kg_microbe/transform_utils/ontologies/ontologies_transform.py
@@ -21,6 +21,7 @@
ENABLED_BY_RELATION,
EXCLUSION_TERMS_FILE,
GO_PREFIX,
HGNC_GENEPROPERTY_PREFIX,
ID_COLUMN,
MONDO_XREFS_FILEPATH,
NCBITAXON_PREFIX,
@@ -67,15 +68,16 @@
from ..transform import Transform

ONTOLOGIES_MAP = {
"ncbitaxon": "ncbitaxon.owl.gz",
"chebi": "chebi.owl.gz",
"envo": "envo.json",
"go": "go.json",
## "rhea": "rhea.json.gz", # Redundant to RheaMappingsTransform
# "ncbitaxon": "ncbitaxon.owl.gz",
# "chebi": "chebi.owl.gz",
# "envo": "envo.json",
# "go": "go.json",
# ## "rhea": "rhea.json.gz", # Redundant to RheaMappingsTransform
"ec": "ec.json",
"upa": "upa.owl",
"mondo": "mondo.json",
"hp": "hp.json",
# "upa": "upa.owl",
# "mondo": "mondo.json",
# "hp": "hp.json",
"hgnc": "hgnc.owl"
}


@@ -175,7 +177,7 @@ def parse(self, name: str, data_file: Optional[Path], source: str) -> None:
output=self.output_dir / name,
output_format="tsv",
)
if name in ["ec", "upa", "chebi", "mondo"]: # removed "uniprot", "rhea"
if name in ["ec", "upa", "chebi", "mondo", "hgnc"]: # removed "uniprot", "rhea"

self.post_process(name)

@@ -431,7 +433,7 @@ def _replace_quotation_marks(line, description_index):
for line in new_edge_lines:
new_ef.write(line)

if name == "ec": # or name == "rhea":
if name == "ec" or name == "hgnc": # or name == "rhea":
with open(nodes_file, "r") as nf, open(edges_file, "r") as ef:
# Update prefixes in nodes file
new_nf_lines = []
@@ -449,15 +451,17 @@
# Update prefixes in edges file
new_ef_lines = []
for line in ef:
if line.startswith("id"):
continue
else:
if not line.startswith("id"):
line = _replace_special_prefixes(line)
new_ef_lines.append(line)
new_ef_lines.append(line)
if name == "ec":
# Remove Uniprot nodes since accounted for elsewhere
new_nf_lines = [line for line in new_nf_lines if UNIPROT_PREFIX not in line]
new_ef_lines = [line for line in new_ef_lines if UNIPROT_PREFIX not in line]
elif name == "hgnc":
# Remove Property nodes
new_nf_lines = [line for line in new_nf_lines if HGNC_GENEPROPERTY_PREFIX not in line]
new_ef_lines = [line for line in new_ef_lines if HGNC_GENEPROPERTY_PREFIX not in line]
# elif name == "rhea":
# # Remove debio nodes that account for direction, since already there in inverse triples
# # Note that CHEBI and EC predicates do not match Rhea pyobo, so removing them
@@ -475,7 +479,7 @@

# Rewrite edges file
with open(edges_file, "w") as new_ef:
new_ef.write("\t".join(self.edge_header) + "\n")
# new_ef.write("\t".join(self.edge_header) + "\n")
Copilot AI Aug 13, 2025

Commented-out code should be removed rather than left in the codebase. If this header writing logic is needed, it should be uncommented and used; otherwise it should be deleted.

Suggested change
# new_ef.write("\t".join(self.edge_header) + "\n")

for line in new_ef_lines:
new_ef.write(line)

3 changes: 1 addition & 2 deletions kg_microbe/transform_utils/uniprot_human/uniprot_human.py
@@ -60,7 +60,7 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu
os.makedirs(UNIPROT_HUMAN_TMP_DIR, exist_ok=True)
os.makedirs(UNIPROT_HUMAN_TMP_NE_DIR, exist_ok=True)
go_category_trees_dict = prepare_go_dictionary()
mondo_xrefs_dict, mondo_gene_dict = prepare_mondo_dictionary()
mondo_xrefs_dict = prepare_mondo_dictionary()

# make directory in data/transformed
os.makedirs(self.output_dir, exist_ok=True)
@@ -82,7 +82,6 @@
self.output_edge_file,
go_category_trees_dict,
mondo_xrefs_dict,
mondo_gene_dict,
OBSOLETE_TERMS_CSV_FILE,
UNIPROT_HUMAN_RELEVANT_FILE_LIST,
UNIPROT_HUMAN_TMP_NE_DIR,
2 changes: 1 addition & 1 deletion kg_microbe/transform_utils/wallen_etal/wallen_etal.py
@@ -61,7 +61,7 @@ def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_statu

wallen_etal_df = pd.read_excel(input_file, skiprows=3, sheet_name=WALLEN_ETAL_TAB_NAME)
wallen_etal_df[FDR_COLUMN] = pd.to_numeric(wallen_etal_df[FDR_COLUMN], errors="coerce")

import pdb;pdb.set_trace()
Copilot AI Aug 13, 2025

Debug statement with pdb.set_trace() should not be committed to production code. This will cause the program to halt execution and wait for debugger input.

Suggested change
import pdb;pdb.set_trace()

significant_wallenetal_df = wallen_etal_df[
wallen_etal_df[FDR_COLUMN].apply(lambda x: isinstance(x, float))
]
99 changes: 67 additions & 32 deletions kg_microbe/utils/uniprot_utils.py
@@ -9,6 +9,7 @@
from pathlib import Path

import pandas as pd
import requests
from tqdm import tqdm

from kg_microbe.transform_utils.constants import (
@@ -20,6 +21,7 @@
EC_CATEGORY,
EC_PREFIX,
ENABLES,
GENE_CATEGORY,
GENE_TO_PROTEIN_EDGE,
GO_BIOLOGICAL_PROCESS_ID,
GO_BIOLOGICAL_PROCESS_LABEL,
@@ -37,6 +39,7 @@
MONDO_XREFS_FILEPATH,
NCBI_CATEGORY,
NCBITAXON_PREFIX,
NODE_NORMALIZER_URL,
OMIM_PREFIX,
ONTOLOGIES_TREES_DIR,
PARTICIPATES_IN,
@@ -98,6 +101,7 @@
RHEA_PARSED_COLUMN = "rhea_parsed"
DISEASE_PARSED_COLUMN = "disease_parsed"
GENE_PRIMARY_PARSED_COLUMN = "gene_primary_parsed"
GENE_NAME_PRIMARY_PARSED_COLUMN = "gene_name_primary_parsed"
GO_TERM_COLUMN = "GO_Term"
GO_CATEGORY_COLUMN = "GO_Category"
UNIPROT_ID_COLUMN = "Uniprot_ID"
@@ -106,6 +110,29 @@
CHEBI_REGEX = re.compile(r'/ligand_id="ChEBI:(.*?)";')
GO_REGEX = re.compile(r"\[(.*?)\]")

# Takes cure in the form PREFIX:ID
Copilot AI Aug 13, 2025

Typo in comment: 'cure' should be 'curie'.

Suggested change
# Takes cure in the form PREFIX:ID
# Takes curie in the form PREFIX:ID

def normalize_node_api(node_curie):

url = NODE_NORMALIZER_URL + node_curie

# Make the HTTP request to NodeNormalizer
response = requests.get(url, timeout=30)
response.raise_for_status()

# Write response to file if it contains data
entries = response.json()[node_curie]
Copilot AI Aug 13, 2025

This line assumes the response JSON contains the node_curie key, but this could fail if the API response structure is different or if the key doesn't exist, potentially causing a KeyError.

Suggested change
entries = response.json()[node_curie]
entries = response.json().get(node_curie)
if entries is None:
return None

try:
if len(entries) > 1: # .strip().split("\n")
Copilot AI Aug 13, 2025

The condition len(entries) > 1 doesn't make logical sense here. The code is checking if entries has more than one element, but then accesses entries["equivalent_identifiers"], suggesting entries is a dictionary. The logic should verify entries is not None and contains the expected structure.

Suggested change
if len(entries) > 1: # .strip().split("\n")
if entries and "equivalent_identifiers" in entries and entries["equivalent_identifiers"]:

for iden in entries["equivalent_identifiers"]:
if iden["identifier"].split(":")[0] + ":" == HGNC_NEW_PREFIX:
norm_node = iden["identifier"]
return norm_node
# Handle case where node normalizer returns nothing
except TypeError:
return None

else:
Copilot AI Aug 13, 2025

The function lacks proper error handling for HTTP requests. If the API is unavailable or returns an error status, the function will raise an exception without providing a meaningful error message to help with debugging.

Suggested change
else:
url = NODE_NORMALIZER_URL + node_curie
try:
# Make the HTTP request to NodeNormalizer
response = requests.get(url, timeout=30)
response.raise_for_status()
# Write response to file if it contains data
entries = response.json()[node_curie]
if len(entries) > 1: # .strip().split("\n")
for iden in entries["equivalent_identifiers"]:
if iden["identifier"].split(":")[0] + ":" == HGNC_NEW_PREFIX:
norm_node = iden["identifier"]
return norm_node
else:
return None
except requests.exceptions.RequestException as e:
logging.error(f"HTTP request failed for node_curie '{node_curie}' at URL '{url}': {e}")
return None
except (ValueError, KeyError, TypeError) as e:
logging.error(f"Error processing response for node_curie '{node_curie}' at URL '{url}': {e}")

return None

def is_float(entry):
"""Determine if value is float, returns True/False."""
@@ -213,7 +240,7 @@ def parse_disease(disease_entry, mondo_xref_dict):
return mondo_list


def parse_gene(gene_entry, mondo_gene_dict):
def parse_gene(gene_entry, protein_entry):
"""
Get gene ID from gene name entry.

@@ -226,8 +253,10 @@ def parse_gene(gene_entry, mondo_gene_dict):
"""
gene_id = None
if not is_float(gene_entry):
gene_name = gene_entry
gene_id = next((key for key, val in mondo_gene_dict.items() if val == gene_name), None)
# gene_name = gene_entry
# gene_id = next((key for key, val in mondo_gene_dict.items() if val == gene_name), None)
# if not gene_id:
gene_id = normalize_node_api(protein_entry)

return gene_id

@@ -289,7 +318,6 @@ def get_nodes_and_edges(
uniprot_df,
go_category_trees_dictionary,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
):
"""
@@ -321,6 +349,7 @@
parsed_columns += [
DISEASE_PARSED_COLUMN,
GENE_PRIMARY_PARSED_COLUMN,
GENE_NAME_PRIMARY_PARSED_COLUMN
]
uniprot_parse_df = pd.DataFrame(columns=parsed_columns)
uniprot_parse_df[ORGANISM_PARSED_COLUMN] = uniprot_df[UNIPROT_ORG_ID_COLUMN_NAME].apply(
@@ -349,10 +378,20 @@
uniprot_parse_df[DISEASE_PARSED_COLUMN] = uniprot_df[UNIPROT_DISEASE_COLUMN_NAME].apply(
lambda x: parse_disease(x, mondo_xrefs_dict)
)
# if UNIPROT_GENE_PRIMARY_COLUMN_NAME in uniprot_df.columns:
# uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df[
# UNIPROT_GENE_PRIMARY_COLUMN_NAME
# ].apply(lambda x: parse_gene(x, mondo_gene_dict))
if UNIPROT_GENE_PRIMARY_COLUMN_NAME in uniprot_df.columns:
uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df[
UNIPROT_GENE_PRIMARY_COLUMN_NAME
].apply(lambda x: parse_gene(x, mondo_gene_dict))
uniprot_parse_df[GENE_PRIMARY_PARSED_COLUMN] = uniprot_df.apply(
lambda row: parse_gene(
row[UNIPROT_GENE_PRIMARY_COLUMN_NAME],
UNIPROT_PREFIX + str(row[UNIPROT_PROTEIN_ID_COLUMN_NAME]
)
if pd.notna(row[UNIPROT_PROTEIN_ID_COLUMN_NAME])
else ""),
axis=1
)

for _, entry in uniprot_parse_df.iterrows():
# Organism node
@@ -502,28 +541,28 @@ def prepare_mondo_dictionary():
for row in csv_reader:
if OMIM_PREFIX in row["xref"]:
mondo_xrefs_dict[row["id"]] = row["xref"]
# Read MONDO nodes file for gene names
mondo_gene_dict = {}
if MONDO_GENE_IDS_FILEPATH.exists():
with open(MONDO_GENE_IDS_FILEPATH, "r") as file:
csv_reader = csv.DictReader(file, delimiter="\t")
for row in csv_reader:
mondo_gene_dict[row["id"]] = row["name"]
#! TODO: use oak
else:
mondo_nodes_file = (
Path(__file__).parents[2] / "data" / "transformed" / "ontologies" / "mondo_nodes.tsv"
)
if mondo_nodes_file.exists():
with open(mondo_nodes_file, "r") as file:
csv_reader = csv.DictReader(file, delimiter="\t")
for row in csv_reader:
if HGNC_NEW_PREFIX in row["id"]:
mondo_gene_dict[row["id"]] = row["name"]
mondo_gene_df = pd.DataFrame(list(mondo_gene_dict.items()), columns=["id", "name"])
mondo_gene_df.to_csv(MONDO_GENE_IDS_FILEPATH, sep="\t", index=False)

return mondo_xrefs_dict, mondo_gene_dict
return mondo_xrefs_dict
# # Read MONDO nodes file for gene names
# mondo_gene_dict = {}
# if MONDO_GENE_IDS_FILEPATH.exists():
# with open(MONDO_GENE_IDS_FILEPATH, "r") as file:
# csv_reader = csv.DictReader(file, delimiter="\t")
# for row in csv_reader:
# mondo_gene_dict[row["id"]] = row["name"]
# #! TODO: use oak
# else:
# mondo_nodes_file = (
# Path(__file__).parents[2] / "data" / "transformed" / "ontologies" / "mondo_nodes.tsv"
# )
# if mondo_nodes_file.exists():
# with open(mondo_nodes_file, "r") as file:
# csv_reader = csv.DictReader(file, delimiter="\t")
# for row in csv_reader:
# if HGNC_NEW_PREFIX in row["id"]:
# mondo_gene_dict[row["id"]] = row["name"]
# mondo_gene_df = pd.DataFrame(list(mondo_gene_dict.items()), columns=["id", "name"])
# mondo_gene_df.to_csv(MONDO_GENE_IDS_FILEPATH, sep="\t", index=False)


def process_lines(
@@ -537,7 +576,6 @@
progress_class,
go_category_dictionary,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
):
"""
@@ -566,7 +604,6 @@
df,
go_category_dictionary,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
)
# Write node and edge data to unique files
@@ -694,7 +731,6 @@ def create_pool(
output_edge_file,
go_category_trees_dict,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
uniprot_relevant_file_list,
uniprot_tmp_ne_dir,
@@ -726,7 +762,6 @@
progress_class,
go_category_trees_dict,
mondo_xrefs_dict,
mondo_gene_dict,
obsolete_terms_csv_file,
)
for line_chunk in line_chunks
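As a usage note on the new normalize_node_api helper: parse_gene now passes it a UniProtKB CURIE (UNIPROT_PREFIX plus the protein accession) and expects an HGNC CURIE back from the Node Normalizer's equivalent_identifiers list. A rough sketch of the call pattern and the response shape it relies on; the helper name, the payload fields, and the TP53 example below are assumptions for illustration, not something this diff asserts:

import requests

NODE_NORMALIZER_URL = "https://nodenormalization-sri.renci.org/1.5/get_normalized_nodes?curie="
HGNC_NEW_PREFIX = "HGNC:"

def normalize_to_hgnc(node_curie: str):
    """Return the first HGNC-prefixed equivalent identifier for node_curie, or None."""
    response = requests.get(NODE_NORMALIZER_URL + node_curie, timeout=30)
    response.raise_for_status()
    # Assumed shape: {"<curie>": {"id": {...}, "equivalent_identifiers": [{"identifier": "HGNC:...", ...}, ...]}}
    entry = response.json().get(node_curie)
    if not entry:
        return None
    for iden in entry.get("equivalent_identifiers", []):
        if iden["identifier"].startswith(HGNC_NEW_PREFIX):
            return iden["identifier"]
    return None

# Example, assuming UNIPROT_PREFIX resolves to "UniProtKB:" and the service conflates proteins to genes:
# normalize_to_hgnc("UniProtKB:P04637") would be expected to return an HGNC CURIE for TP53 (e.g. "HGNC:11998").

Note that, as written in the diff, normalize_node_api issues one HTTP request per gene row, so caching or batching CURIEs may be worth considering if this becomes a bottleneck.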