Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 7 additions & 1 deletion pyaptamer/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,17 @@
)
from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
from pyaptamer.datasets._loaders._load_aptamer import (
load_aptadb,
load_encoders,
)
from pyaptamer.datasets._loaders._one_gnh import load_1gnh, load_1gnh_structure
from pyaptamer.datasets._loaders._online_databank import load_from_rcsb
from pyaptamer.datasets._loaders._pfoa import load_pfoa, load_pfoa_structure

__all__ = [
"load_aptacom_full",
"load_aptacom_xy",
"load_aptacom_x_y",
"load_csv_dataset",
"load_hf_dataset",
"load_pfoa",
Expand All @@ -21,4 +25,6 @@
"load_1gnh_structure",
"load_from_rcsb",
"load_csv_dataset",
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

should not be removed

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have added everything back.

"load_aptadb",
"load_encoders",
]
8 changes: 7 additions & 1 deletion pyaptamer/datasets/_loaders/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,23 @@
)
from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset
from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset
from pyaptamer.datasets._loaders._load_aptamer import (
    load_aptadb,
    load_encoders,
)
from pyaptamer.datasets._loaders._one_gnh import load_1gnh, load_1gnh_structure
from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure

# Public loader API, sorted alphabetically. The previous list contained
# duplicate entries ("load_pfoa_structure", "load_1gnh_structure") and
# listed "load_1gnh" without importing it, which would break
# ``from pyaptamer.datasets._loaders import *``.
__all__ = [
    "load_1gnh",
    "load_1gnh_structure",
    "load_aptacom_full",
    "load_aptacom_x_y",
    "load_aptacom_xy",
    "load_aptadb",
    "load_csv_dataset",
    "load_encoders",
    "load_hf_dataset",
    "load_pfoa_structure",
]
218 changes: 218 additions & 0 deletions pyaptamer/datasets/_loaders/_load_aptamer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,218 @@
__author__ = "Satarupa22-SD"
__all__ = ["load_aptadb", "load_encoders"]

from pathlib import Path

import pandas as pd


def _download_dataset(
    dataset_name: str,
    target_dir: Path,
    force_download: bool = False,
) -> None:
    """Download a Kaggle dataset into ``target_dir`` and unzip it.

    The download is skipped entirely when ``target_dir`` already contains
    at least one CSV file, unless ``force_download`` is True.

    Parameters
    ----------
    dataset_name : str
        Kaggle dataset identifier, formatted as ``"username/dataset-name"``.
    target_dir : pathlib.Path
        Directory the dataset is downloaded and unzipped into. Created
        (including parents) if it does not exist.
    force_download : bool, default False
        If True, download even when cached CSV files are present.

    Raises
    ------
    ImportError
        If a download is required but the ``kaggle`` package is missing.
    """
    target_dir.mkdir(parents=True, exist_ok=True)

    # Only download if forced or no CSV files exist
    if force_download or not any(target_dir.glob("*.csv")):
        # Import lazily and only on the download path: the previous
        # unconditional import meant even fully-cached use required the
        # kaggle package (and triggered its import-time authentication).
        import kaggle

        kaggle.api.dataset_download_files(
            dataset_name,
            path=str(target_dir),
            unzip=True,
        )


def load_encoders(
path: str | Path,
*,
encoding: str | None = None,
**read_csv_kwargs,
) -> pd.DataFrame:
"""
Load a CSV file into a pandas DataFrame with multi-encoding fallback.

This function attempts to read a CSV file using a list of common text
encodings, ensuring robust loading even when the file has ambiguous or
non-UTF-8 formatting. When ``encoding`` is explicitly provided, only that
encoding is used. Otherwise, the function tries the following encodings
in order:

``["utf-8", "utf-8-sig", "latin-1", "cp1252", "windows-1252"]``

Parameters
----------
path : str or pathlib.Path
Path to the CSV file to load. The file must exist locally.
encoding : str, optional
Specific encoding to use for reading the CSV. If ``None`` (default),
multiple encodings are tried sequentially until one succeeds.
**read_csv_kwargs
Additional keyword arguments passed directly to ``pandas.read_csv``.
Useful for specifying delimiters, NA values, column types, etc.

Returns
-------
pandas.DataFrame
A DataFrame containing the parsed CSV data.

Raises
------
RuntimeError
If the file cannot be read with any of the attempted encodings.
FileNotFoundError
If the given path does not point to an existing file.

Examples
--------
>>> df = load_encoders("aptamer_interactions.csv")
>>> df.head()
"""
candidate_encodings = (
[
"utf-8",
"utf-8-sig",
"latin-1",
"cp1252",
"windows-1252",
]
if encoding is None
else [encoding]
)

last_error: Exception | None = None

for enc in candidate_encodings:
try:
return pd.read_csv(path, encoding=enc, **read_csv_kwargs)
except Exception as e: # pragma: no cover - exercised via fallback
last_error = e
continue

raise RuntimeError(
f"Failed to read CSV {path} with encodings {candidate_encodings}: {last_error}"
)


def load_aptadb(
    dataset_name: str = "satarupadeb/aptamer-interactions",
    cache_dir: str | Path | None = None,
    force_download: bool = False,
    *,
    encoding: str | None = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Download (if needed) and load the aptamer interaction dataset from Kaggle.

    The dataset is fetched via the Kaggle API, cached locally, and the
    expected CSV file ``aptamer_interactions.csv`` is loaded into a pandas
    DataFrame. A cached copy is reused unless ``force_download=True``.

    Parameters
    ----------
    dataset_name : str, optional
        The Kaggle dataset identifier, formatted as
        ``"username/dataset-name"``.
        Default is ``"satarupadeb/aptamer-interactions"``.
    cache_dir : str or pathlib.Path, optional
        Directory where the dataset will be downloaded and cached.
        If ``None`` (default), the cache is stored under
        ``~/.pyaptamer/cache/<dataset_name>/``.
    force_download : bool, default False
        If ``True``, the dataset is downloaded even if a cached CSV already
        exists.
    encoding : str, optional
        Encoding to pass to the CSV loader. If ``None``, multiple encodings
        are attempted by ``load_encoders``.
    **kwargs
        Additional keyword arguments passed to ``load_encoders`` and
        ultimately to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded aptamer interactions dataset. Typical columns include
        ``aptamer_id``, ``target_id``, ``aptamer_sequence``,
        ``target_name``, ``target_uniprot``, ``organism``, ``ligand_type``,
        ``binding_conditions``, ``reference_pubmed_id`` and
        ``interaction_present``.

    Raises
    ------
    ImportError
        If the ``kaggle`` Python package is not installed.
    RuntimeError
        If the dataset download fails.
    FileNotFoundError
        If ``aptamer_interactions.csv`` is not present after download.

    Notes
    -----
    Kaggle API credentials must be configured before using this function,
    e.g. by setting the ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` environment
    variables.

    Examples
    --------
    Set the required environment variables in Python:

    >>> import os
    >>> os.environ["KAGGLE_USERNAME"] = (
    ...     "yourkaggleusername"  # Replace with your username
    ... )
    >>> os.environ["KAGGLE_KEY"] = "yourkaggleapi"  # Replace with your Kaggle API key

    Then load the dataset:

    >>> from pyaptamer.datasets import load_aptadb
    >>> df = load_aptadb()
    >>> df = load_aptadb(cache_dir="data/cache", force_download=True)
    """
    # Resolve the cache directory; the default flattens "/" in the dataset
    # name so distinct datasets never collide on disk.
    cache_root = (
        Path(cache_dir)
        if cache_dir is not None
        else Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_")
    )

    # The single CSV file this loader expects the Kaggle dataset to contain.
    target_csv = cache_root / "aptamer_interactions.csv"

    if force_download or not target_csv.exists():
        try:
            _download_dataset(dataset_name, cache_root, force_download=force_download)
        except ImportError as err:
            raise ImportError(
                "The 'kaggle' package is required to download datasets. "
                "Install it with: pip install kaggle"
            ) from err
        except Exception as e:
            raise RuntimeError(
                f"Failed to download dataset '{dataset_name}' from Kaggle: {e}"
            ) from e

    if not target_csv.exists():
        raise FileNotFoundError(
            f"Expected file 'aptamer_interactions.csv' not found at {cache_root}"
        )

    return load_encoders(path=str(target_csv), encoding=encoding, **kwargs)
123 changes: 123 additions & 0 deletions pyaptamer/datasets/tests/test_aptamer_interactions_loader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
__author__ = "Satarupa22-SD"

from unittest.mock import patch

import pandas as pd
import pytest

from pyaptamer.datasets import load_aptadb
from pyaptamer.datasets._loaders._load_aptamer import load_encoders


def test_local_csv(tmp_path):
    """Test loading aptamer data from a local CSV file."""
    sample_file = tmp_path / "aptadb_sample.csv"
    record = {
        "aptamer_id": ["APT001"],
        "aptamer_sequence": ["AUGCUU"],
        "target_name": ["Thrombin"],
        "interaction_present": ["1"],
    }
    pd.DataFrame(record).to_csv(sample_file, index=False)

    loaded = load_encoders(sample_file)
    assert isinstance(loaded, pd.DataFrame)
    assert not loaded.empty
    assert loaded.loc[0, "aptamer_sequence"] == "AUGCUU"


def test_uses_cache(tmp_path):
    """Test that cached data is used instead of downloading."""
    cached = pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]})
    cached.to_csv(tmp_path / "aptamer_interactions.csv", index=False)

    result = load_aptadb(cache_dir=tmp_path)
    assert not result.empty
    assert result.loc[0, "aptamer_sequence"] == "AUGU"


def test_requires_kaggle(tmp_path):
    """Test that ImportError is raised when kaggle package is missing."""
    # tmp_path holds no CSV, so a download is attempted; masking the kaggle
    # module in sys.modules makes that attempt fail with ImportError.
    with patch.dict("sys.modules", {"kaggle": None}):
        with pytest.raises(ImportError):
            load_aptadb(cache_dir=tmp_path)


def test_invalid_dataset(tmp_path):
    """Test error handling for invalid dataset download."""
    download_helper = "pyaptamer.datasets._loaders._load_aptamer._download_dataset"
    # Any failure inside the download helper must surface as RuntimeError.
    with patch(download_helper, side_effect=Exception("boom")):
        with pytest.raises(
            RuntimeError, match=r"Failed to download dataset .* from Kaggle"
        ):
            load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path)


@pytest.fixture
def sample_aptadb_data():
    """Create sample aptamer interaction data for testing."""
    records = [
        ("APT001", "TGT001", "ATCGATCGATCGATCG", "Thrombin", "P00734",
         "Homo sapiens", "Protein", "pH 7.4, 25°C", "12345678", 1),
        ("APT002", "TGT002", "GCTAGCTAGCTAGCTA", "VEGF", "P15692",
         "Homo sapiens", "Protein", "pH 7.0, 37°C", "87654321", 1),
        ("APT003", "TGT003", "TTAACCGGTTAACCGG", "Lysozyme", "P61626",
         "Gallus gallus", "Protein", "pH 8.0, 25°C", "11223344", 0),
    ]
    columns = [
        "aptamer_id",
        "target_id",
        "aptamer_sequence",
        "target_name",
        "target_uniprot",
        "organism",
        "ligand_type",
        "binding_conditions",
        "reference_pubmed_id",
        "interaction_present",
    ]
    return pd.DataFrame.from_records(records, columns=columns)


def test_sample_columns(sample_aptadb_data):
"""Test that sample data contains expected columns and data types."""
df = sample_aptadb_data
assert isinstance(df, pd.DataFrame)
assert len(df) == 3

expected_columns = [
"aptamer_id",
"target_id",
"aptamer_sequence",
"target_name",
"target_uniprot",
"organism",
"ligand_type",
"binding_conditions",
"reference_pubmed_id",
"interaction_present",
]

for col in expected_columns:
assert col in df.columns, f"Expected column '{col}' not found in dataset"

assert df["aptamer_sequence"].dtype == "object"
assert df["target_name"].dtype == "object"


@pytest.mark.slow
def test_cache_consistency(tmp_path):
    """Test that consecutive calls with cache yield identical DataFrames."""
    source = pd.DataFrame(
        {"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]}
    )
    source.to_csv(tmp_path / "aptamer_interactions.csv", index=False)

    first = load_aptadb(cache_dir=tmp_path)
    second = load_aptadb(cache_dir=tmp_path)
    pd.testing.assert_frame_equal(first, second)
Loading
Loading