-
Notifications
You must be signed in to change notification settings - Fork 4
UniRef90 transform #170
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
UniRef90 transform #170
Changes from all commits
ae9005c
8504631
3008a68
4a696c8
0d08846
142ffc0
79b03db
8888377
511cdb4
59a7c2c
6387605
2938f86
9a59148
75116ea
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,5 @@ | ||
"""Uniref transform package.

Re-exports :class:`UnirefTransform`, the entry point for converting
UniRef90 cluster TSV data into KG nodes and edges.
"""

from .uniref import UnirefTransform

__all__ = ["UnirefTransform"]
| Original file line number | Diff line number | Diff line change | ||||||||
|---|---|---|---|---|---|---|---|---|---|---|
| @@ -0,0 +1,115 @@ | ||||||||||
| """UniRef Transformation Module.""" | ||||||||||
|
|
||||||||||
| import csv | ||||||||||
| import gc | ||||||||||
| import os | ||||||||||
| import sys | ||||||||||
| from pathlib import Path | ||||||||||
| from typing import Optional, Union | ||||||||||
|
|
||||||||||
| from oaklib import get_adapter | ||||||||||
| from tqdm import tqdm | ||||||||||
|
|
||||||||||
| from kg_microbe.transform_utils.constants import ( | ||||||||||
| CLUSTER_CATEGORY, | ||||||||||
| NCBI_CATEGORY, | ||||||||||
| NCBI_TO_CLUSTER_EDGE, | ||||||||||
| NCBITAXON_PREFIX, | ||||||||||
| OCCURS_IN, | ||||||||||
| UNIREF_90_PREFIX, | ||||||||||
| ) | ||||||||||
| from kg_microbe.transform_utils.transform import Transform | ||||||||||
| from kg_microbe.utils.dummy_tqdm import DummyTqdm | ||||||||||
| from kg_microbe.utils.pandas_utils import drop_duplicates | ||||||||||
|
|
||||||||||
| csv.field_size_limit(sys.maxsize - 1) # _csv.Error: field larger than field limit (131072) | ||||||||||
|
|
||||||||||
|
|
||||||||||
| class UnirefTransform(Transform): | ||||||||||
|
|
||||||||||
| """UniRef Transformation Class.""" | ||||||||||
|
|
||||||||||
def __init__(
    self,
    input_dir: Optional[Path] = None,
    output_dir: Optional[Path] = None,
):
    """
    Instantiate the UniRef transform.

    :param input_dir: Directory containing the raw UniRef input data.
    :param output_dir: Directory where transformed node/edge TSVs are written.
    """
    super().__init__("Uniref", input_dir, output_dir)
    # oaklib adapter over the NCBITaxon sqlite build, used to resolve taxon terms.
    self.ncbi_impl = get_adapter("sqlite:obo:ncbitaxon")
|
|
||||||||||
| def run(self, data_file: Union[Optional[Path], Optional[str]] = None, show_status: bool = True): | ||||||||||
| """Run the transformation.""" | ||||||||||
| input_file = os.path.join( | ||||||||||
| self.input_base_dir, "uniref90_api_subset.tsv" | ||||||||||
| ) # must exist already | ||||||||||
|
|
||||||||||
| progress_class = tqdm if show_status else DummyTqdm | ||||||||||
|
|
||||||||||
| with open(input_file, "r") as tsvfile, open(self.output_node_file, "w") as nodes_file, open( | ||||||||||
| self.output_edge_file, "w" | ||||||||||
| ) as edges_file: | ||||||||||
| # Create a CSV reader specifying the delimiter as a tab character | ||||||||||
| tsvreader = csv.DictReader(tsvfile, delimiter="\t") | ||||||||||
| node_writer = csv.writer(nodes_file, delimiter="\t") | ||||||||||
| edge_writer = csv.writer(edges_file, delimiter="\t") | ||||||||||
| source = UNIREF_90_PREFIX.strip(":") | ||||||||||
|
|
||||||||||
| # Write the header for the files | ||||||||||
| node_writer.writerow(self.node_header) | ||||||||||
| edge_writer.writerow(self.edge_header) | ||||||||||
|
|
||||||||||
| with progress_class(desc="Processing clusters...") as progress: | ||||||||||
| # Iterate over each row in the TSV file | ||||||||||
| for row in tsvreader: | ||||||||||
| # Extract the desired fields | ||||||||||
| cluster_id = row["Cluster ID"].replace("_", ":") | ||||||||||
| cluster_name = row["Cluster Name"].lstrip("Cluster:").strip() | ||||||||||
| ncbitaxon_ids = [ | ||||||||||
| NCBITAXON_PREFIX + x.strip() for x in row["Organism IDs"].split(";") if x | ||||||||||
| ] | ||||||||||
| ncbi_labels = [ | ||||||||||
| ncbi_label.strip() | ||||||||||
| for ncbi_label in row["Organisms"].split(";") | ||||||||||
| if ncbi_label | ||||||||||
| ] | ||||||||||
| nodes_data_to_write = [ | ||||||||||
| [ncbitaxon_id, NCBI_CATEGORY, ncbi_label] | ||||||||||
| for ncbitaxon_id, ncbi_label in zip( | ||||||||||
| ncbitaxon_ids, ncbi_labels, strict=False | ||||||||||
| ) | ||||||||||
| ] | ||||||||||
| # nodes_data_to_write.append([cluster_id, CLUSTER_CATEGORY, cluster_name]) | ||||||||||
| nodes_data_to_write = [ | ||||||||||
| sublist + [None] * (len(self.node_header) - 3) | ||||||||||
| for sublist in nodes_data_to_write | ||||||||||
| ] | ||||||||||
| node_writer.writerows(nodes_data_to_write) | ||||||||||
| gc.collect() | ||||||||||
|
||||||||||
| gc.collect() | |
| # Call gc.collect() every 1000 rows | |
| if row_count % 1000 == 0: | |
| gc.collect() |
Copilot
AI
Aug 13, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Manual garbage collection after every processed row is likely to hurt performance more than it helps. Consider removing this call, or invoking it far less frequently (e.g., once every 1,000 rows).
| gc.collect() |
Copilot
AI
Aug 13, 2025
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The magic number 2000 for progress updates is unclear. Consider extracting it into a named constant, or adding a comment explaining why this specific value was chosen.
| progress.update(2000) | |
| progress.update(PROGRESS_UPDATE_INTERVAL) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Remove commented-out code. The cluster node creation is handled in lines 92-94, making this commented line redundant.