2 changes: 2 additions & 0 deletions .gitignore
@@ -163,3 +163,5 @@ data/transformed/uniprot_genome_features/*.tsv
kg_microbe/transform_utils/uniprot/tmp/relevant_file_content.txt
kg_microbe/transform_utils/uniprot/tmp/nodes_and_edges/*
data/transformed/uniprot_genome_features/uniprot_kgx.zip
data/transformed/Uniref/edges.tsv
data/transformed/Uniref/nodes.tsv
2 changes: 2 additions & 0 deletions kg_microbe/transform.py
@@ -11,6 +11,7 @@
from kg_microbe.transform_utils.rhea.rhea import RheaMappingsTransform
from kg_microbe.transform_utils.traits.traits import TraitsTransform
from kg_microbe.transform_utils.uniprot.uniprot import UniprotTransform
from kg_microbe.transform_utils.uniref.uniref import UnirefTransform

DATA_SOURCES = {
# "DrugCentralTransform": DrugCentralTransform,
@@ -28,6 +29,7 @@
"RheaMappingsTransform": RheaMappingsTransform,
"BactoTraitsTransform": BactoTraitsTransform,
"UniprotTransform": UniprotTransform,
"UnirefTransform": UnirefTransform,
}


7 changes: 7 additions & 0 deletions kg_microbe/transform_utils/constants.py
@@ -123,6 +123,8 @@
RHEA_OLD_PREFIX = "OBO:rhea_"
RHEA_NEW_PREFIX = "RHEA:"
ASSAY_PREFIX = "assay:"
UNIREF_90_PREFIX = "UniRef90:"

RHEA_URI = "http://purl.obolibrary.org/obo/rhea_"
DEBIO_OBO_PREFIX = "OBO:debio_"
DEBIO_NEW_PREFIX = "debio:"
@@ -163,6 +165,8 @@
NCBI_TO_SUBSTRATE_EDGE = "biolink:consumes"
RHEA_TO_EC_EDGE = "biolink:enabled_by"
RHEA_TO_GO_EDGE = "biolink:enables"
NCBI_TO_CLUSTER_EDGE = "biolink:occurs_in"


NCBI_CATEGORY = "biolink:OrganismTaxon"
MEDIUM_CATEGORY = "biolink:ChemicalEntity"
@@ -179,6 +183,7 @@
ATTRIBUTE_CATEGORY = "biolink:Attribute"
METABOLITE_CATEGORY = "biolink:ChemicalEntity"
SUBSTRATE_CATEGORY = "biolink:ChemicalEntity"
CLUSTER_CATEGORY = "biolink:ProteinFamily"

HAS_PART = "BFO:0000051"
IS_GROWN_IN = "BAO:0002924"
@@ -194,6 +199,7 @@
ASSESSED_ACTIVITY_RELATIONSHIP = "NCIT:C153110"
CLOSE_MATCH = "skos:closeMatch"
ASSOCIATED_WITH = "PATO:0001668"
OCCURS_IN = "BFO:0000066"

ID_COLUMN = "id"
NAME_COLUMN = "name"
@@ -410,6 +416,7 @@
    GO_PREFIX,
    MEDIADIVE_MEDIUM_PREFIX,
    STRAIN_PREFIX,
    UNIREF_90_PREFIX,
]

HAS_PARTICIPANT_PREDICATE = "biolink:has_participant"
5 changes: 5 additions & 0 deletions kg_microbe/transform_utils/uniref/__init__.py
@@ -0,0 +1,5 @@
"""Uniref transform."""

from .uniref import UnirefTransform

__all__ = ["UnirefTransform"]
115 changes: 115 additions & 0 deletions kg_microbe/transform_utils/uniref/uniref.py
@@ -0,0 +1,115 @@
"""UniRef Transformation Module."""

import csv
import gc
import os
import sys
from pathlib import Path
from typing import Optional, Union

from oaklib import get_adapter
from tqdm import tqdm

from kg_microbe.transform_utils.constants import (
    CLUSTER_CATEGORY,
    NCBI_CATEGORY,
    NCBI_TO_CLUSTER_EDGE,
    NCBITAXON_PREFIX,
    OCCURS_IN,
    UNIREF_90_PREFIX,
)
from kg_microbe.transform_utils.transform import Transform
from kg_microbe.utils.dummy_tqdm import DummyTqdm
from kg_microbe.utils.pandas_utils import drop_duplicates

csv.field_size_limit(sys.maxsize - 1) # _csv.Error: field larger than field limit (131072)


class UnirefTransform(Transform):

    """UniRef Transformation Class."""

    def __init__(
        self,
        input_dir: Optional[Path] = None,
        output_dir: Optional[Path] = None,
    ):
        """Instantiate part."""
        source_name = "Uniref"
        super().__init__(source_name, input_dir, output_dir)
        self.ncbi_impl = get_adapter("sqlite:obo:ncbitaxon")

    def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_status: bool = True):
        """Run the transformation."""
        input_file = os.path.join(
            self.input_base_dir, "uniref90_api_subset.tsv"
        )  # must exist already

        progress_class = tqdm if show_status else DummyTqdm

        with open(input_file, "r") as tsvfile, open(self.output_node_file, "w") as nodes_file, open(
            self.output_edge_file, "w"
        ) as edges_file:
            # Create a CSV reader specifying the delimiter as a tab character
            tsvreader = csv.DictReader(tsvfile, delimiter="\t")
            node_writer = csv.writer(nodes_file, delimiter="\t")
            edge_writer = csv.writer(edges_file, delimiter="\t")
            source = UNIREF_90_PREFIX.strip(":")

            # Write the header for the files
            node_writer.writerow(self.node_header)
            edge_writer.writerow(self.edge_header)

            with progress_class(desc="Processing clusters...") as progress:
                # Iterate over each row in the TSV file
                for row in tsvreader:
                    # Extract the desired fields
                    cluster_id = row["Cluster ID"].replace("_", ":")
                    cluster_name = row["Cluster Name"].lstrip("Cluster:").strip()
                    ncbitaxon_ids = [
                        NCBITAXON_PREFIX + x.strip() for x in row["Organism IDs"].split(";") if x
                    ]
                    ncbi_labels = [
                        ncbi_label.strip()
                        for ncbi_label in row["Organisms"].split(";")
                        if ncbi_label
                    ]
                    nodes_data_to_write = [
                        [ncbitaxon_id, NCBI_CATEGORY, ncbi_label]
                        for ncbitaxon_id, ncbi_label in zip(
                            ncbitaxon_ids, ncbi_labels, strict=False
                        )
                    ]
                    # nodes_data_to_write.append([cluster_id, CLUSTER_CATEGORY, cluster_name])
Copilot AI (Aug 13, 2025):
Remove commented-out code. The cluster node creation is handled a few lines below (the cluster_node_data block), making this commented line redundant.

Suggested change:
- # nodes_data_to_write.append([cluster_id, CLUSTER_CATEGORY, cluster_name])
                    nodes_data_to_write = [
                        sublist + [None] * (len(self.node_header) - 3)
                        for sublist in nodes_data_to_write
                    ]
                    node_writer.writerows(nodes_data_to_write)
                    gc.collect()
Copilot AI (Aug 13, 2025):
Calling gc.collect() manually after every row may hurt performance more than it helps. Consider removing this call or running it less often (e.g., every 1000 rows).

Suggested change:
- gc.collect()
+ # Call gc.collect() every 1000 rows
+ if row_count % 1000 == 0:
+     gc.collect()
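The suggestion references a row_count variable that the PR never defines; a minimal sketch of how the throttled collection could be wired in, assuming the row loop is switched to enumerate (GC_INTERVAL and write_rows_with_throttled_gc are illustrative names, not part of the PR):

import csv
import gc

GC_INTERVAL = 1000  # illustrative constant: rows processed between manual collections


def write_rows_with_throttled_gc(tsvfile, handle_row):
    """Sketch: run gc.collect() every GC_INTERVAL rows instead of after every row."""
    tsvreader = csv.DictReader(tsvfile, delimiter="\t")
    for row_count, row in enumerate(tsvreader, start=1):
        handle_row(row)  # stand-in for the node/edge writing done in UnirefTransform.run
        if row_count % GC_INTERVAL == 0:
            gc.collect()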

                    # Write the cluster node
                    cluster_node_data = [cluster_id, CLUSTER_CATEGORY, cluster_name]
                    cluster_node_data.extend([None] * (len(self.node_header) - 3))
                    node_writer.writerow(cluster_node_data)

                    # Write the edge for the cluster
                    edges_data_to_write = [
                        [
                            ncbitaxon_id,
                            NCBI_TO_CLUSTER_EDGE,
                            cluster_id,
                            OCCURS_IN,
                            source,
                        ]
                        for ncbitaxon_id in ncbitaxon_ids
                    ]
                    edge_writer.writerows(edges_data_to_write)
                    gc.collect()
Copilot AI (Aug 13, 2025):
Calling gc.collect() manually after every row may hurt performance more than it helps. Consider removing this call or running it less often (e.g., every 1000 rows).

Suggested change:
- gc.collect()

                    progress.set_description(f"Processing Cluster: {cluster_id}")
                    # After each iteration, call the update method to advance the progress bar.
                    progress.update(2000)
Copilot AI (Aug 13, 2025):
The magic number 2000 used for progress updates is unclear. Consider making it a named constant or adding a comment that explains why this specific value is used.

Suggested change:
- progress.update(2000)
+ progress.update(PROGRESS_UPDATE_INTERVAL)
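PROGRESS_UPDATE_INTERVAL does not exist anywhere in the PR; a minimal sketch of the named-constant approach, assuming the constant is defined at module level in uniref.py (it could equally live in constants.py), might look like this:

from tqdm import tqdm

# Illustrative constant; 2000 mirrors the value hard-coded in the PR (its rationale is not stated there).
PROGRESS_UPDATE_INTERVAL = 2000


def advance_progress(progress: tqdm, cluster_id: str) -> None:
    """Sketch: advance the bar by a named step for each processed cluster."""
    progress.set_description(f"Processing Cluster: {cluster_id}")
    progress.update(PROGRESS_UPDATE_INTERVAL)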

        drop_duplicates(self.output_node_file)
        drop_duplicates(self.output_edge_file)
7 changes: 7 additions & 0 deletions merge.yaml
@@ -105,6 +105,13 @@ merged_graph:
        filename:
          - data/transformed/uniprot_genome_features/nodes.tsv
          - data/transformed/uniprot_genome_features/edges.tsv
    uniref:
      input:
        name: "Uniref"
        format: tsv
        filename:
          - data/transformed/Uniref/nodes.tsv
          - data/transformed/Uniref/edges.tsv
  operations:
    - name: kgx.graph_operations.summarize_graph.generate_graph_stats
      args: