-
Notifications
You must be signed in to change notification settings - Fork 11
[ENH] aptadb loader #159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[ENH] aptadb loader #159
Changes from all commits
7d1741f
7411080
e168112
06dc359
c1cf45a
0d95d0b
241ce49
30f6ed1
23fbe70
12a7833
dd0c587
0f9ebfa
8873f40
fdf3b6f
ff79244
83a78d7
321d772
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,17 +6,23 @@ | |
| ) | ||
| from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset | ||
| from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset | ||
| from pyaptamer.datasets._loaders._load_aptamer import ( | ||
| load_aptadb, | ||
| load_encoders, | ||
| ) | ||
| from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure | ||
| from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure | ||
|
|
||
| __all__ = [ | ||
| "load_pfoa_structure", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it looks like you are deleting some exports here that are unrelated to your PR; can you please revert those changes?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please check now |
||
| "load_1gnh_structure", | ||
| "load_aptacom_full", | ||
| "load_aptacom_xy", | ||
| "load_aptacom_x_y", | ||
| "load_csv_dataset", | ||
| "load_hf_dataset", | ||
| "load_pfoa_structure", | ||
| "load_1gnh", | ||
| "load_1gnh_structure", | ||
| "load_aptadb", | ||
| "load_encoders", | ||
| ] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,218 @@ | ||
| __author__ = "Satarupa22-SD" | ||
| __all__ = ["load_aptadb", "load_encoders"] | ||
|
|
||
| from pathlib import Path | ||
|
|
||
| import pandas as pd | ||
|
|
||
|
|
||
def _download_dataset(
    dataset_name: str,
    target_dir: Path,
    force_download: bool = False,
) -> None:
    """Fetch a Kaggle dataset into *target_dir* and unzip it.

    The ``kaggle`` import is deferred to call time because importing the
    package triggers API authentication as a side effect.
    """
    import kaggle  # avoid import-time auth

    target_dir.mkdir(parents=True, exist_ok=True)

    # A cached CSV short-circuits the network round-trip unless the
    # caller explicitly forces a fresh download.
    if any(target_dir.glob("*.csv")) and not force_download:
        return

    kaggle.api.dataset_download_files(
        dataset_name,
        path=str(target_dir),
        unzip=True,
    )
|
|
||
|
|
||
| def load_encoders( | ||
| path: str | Path, | ||
| *, | ||
| encoding: str | None = None, | ||
| **read_csv_kwargs, | ||
| ) -> pd.DataFrame: | ||
| """ | ||
| Load a CSV file into a pandas DataFrame with multi-encoding fallback. | ||
|
|
||
| This function attempts to read a CSV file using a list of common text | ||
| encodings, ensuring robust loading even when the file has ambiguous or | ||
| non-UTF-8 formatting. When ``encoding`` is explicitly provided, only that | ||
| encoding is used. Otherwise, the function tries the following encodings | ||
| in order: | ||
|
|
||
| ``["utf-8", "utf-8-sig", "latin-1", "cp1252", "windows-1252"]`` | ||
|
|
||
| Parameters | ||
| ---------- | ||
| path : str or pathlib.Path | ||
| Path to the CSV file to load. The file must exist locally. | ||
| encoding : str, optional | ||
| Specific encoding to use for reading the CSV. If ``None`` (default), | ||
| multiple encodings are tried sequentially until one succeeds. | ||
| **read_csv_kwargs | ||
| Additional keyword arguments passed directly to ``pandas.read_csv``. | ||
| Useful for specifying delimiters, NA values, column types, etc. | ||
|
|
||
| Returns | ||
| ------- | ||
| pandas.DataFrame | ||
| A DataFrame containing the parsed CSV data. | ||
|
|
||
| Raises | ||
| ------ | ||
| RuntimeError | ||
| If the file cannot be read with any of the attempted encodings. | ||
| FileNotFoundError | ||
| If the given path does not point to an existing file. | ||
|
|
||
| Examples | ||
| -------- | ||
| >>> df = load_encoders("aptamer_interactions.csv") | ||
| >>> df.head() | ||
| """ | ||
| candidate_encodings = ( | ||
| [ | ||
| "utf-8", | ||
| "utf-8-sig", | ||
| "latin-1", | ||
| "cp1252", | ||
| "windows-1252", | ||
| ] | ||
| if encoding is None | ||
| else [encoding] | ||
| ) | ||
|
|
||
| last_error: Exception | None = None | ||
|
|
||
| for enc in candidate_encodings: | ||
| try: | ||
| return pd.read_csv(path, encoding=enc, **read_csv_kwargs) | ||
| except Exception as e: # pragma: no cover - exercised via fallback | ||
| last_error = e | ||
| continue | ||
|
|
||
| raise RuntimeError( | ||
| f"Failed to read CSV {path} with encodings {candidate_encodings}: {last_error}" | ||
| ) | ||
|
|
||
|
|
||
def load_aptadb(
    dataset_name: str = "satarupadeb/aptamer-interactions",
    cache_dir: str | Path | None = None,
    force_download: bool = False,
    *,
    encoding: str | None = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Download (if needed) and load the aptamer interaction dataset from Kaggle.

    The dataset is fetched with the Kaggle API, cached locally, and the
    expected file ``aptamer_interactions.csv`` is parsed into a pandas
    DataFrame. A cached CSV is reused unless ``force_download=True``.

    Parameters
    ----------
    dataset_name : str, optional
        Kaggle dataset identifier, formatted as ``"username/dataset-name"``.
        Default is ``"satarupadeb/aptamer-interactions"``.
    cache_dir : str or pathlib.Path, optional
        Directory for the downloaded/cached dataset. If ``None`` (default),
        the cache lives under ``~/.pyaptamer/cache/<dataset_name>/`` with
        the ``/`` in the identifier flattened to ``_``.
    force_download : bool, default False
        If ``True``, download even when a cached CSV already exists.
    encoding : str, optional
        Encoding forwarded to the CSV loader. If ``None``, several common
        encodings are attempted by ``load_encoders``.
    **kwargs
        Extra keyword arguments forwarded to ``load_encoders`` and
        ultimately to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded aptamer interactions dataset.

    Raises
    ------
    ImportError
        If the ``kaggle`` Python package is not installed.
    RuntimeError
        If the dataset download fails.
    FileNotFoundError
        If ``aptamer_interactions.csv`` is not present after download.

    Notes
    -----
    Kaggle API credentials must be configured before calling this function,
    e.g. via the ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` environment
    variables.

    Examples
    --------
    >>> from pyaptamer.datasets import load_aptadb
    >>> df = load_aptadb()
    >>> df = load_aptadb(cache_dir="data/cache", force_download=True)
    """
    # Resolve the cache location; "/" is not valid in a directory name,
    # so the Kaggle identifier is flattened with "_".
    if cache_dir is None:
        cache_dir = (
            Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_")
        )
    else:
        cache_dir = Path(cache_dir)

    # The expected CSV filename in the Kaggle dataset.
    expected_csv = cache_dir / "aptamer_interactions.csv"

    needs_download = force_download or not expected_csv.exists()
    if needs_download:
        try:
            _download_dataset(dataset_name, cache_dir, force_download=force_download)
        except ImportError as err:
            raise ImportError(
                "The 'kaggle' package is required to download datasets. "
                "Install it with: pip install kaggle"
            ) from err
        except Exception as e:
            raise RuntimeError(
                f"Failed to download dataset '{dataset_name}' from Kaggle: {e}"
            ) from e

    # Even after an apparently successful download the expected file may be
    # absent (e.g. the dataset layout changed) — fail loudly in that case.
    if not expected_csv.exists():
        raise FileNotFoundError(
            f"Expected file 'aptamer_interactions.csv' not found at {cache_dir}"
        )

    return load_encoders(path=str(expected_csv), encoding=encoding, **kwargs)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,123 @@ | ||
| __author__ = "Satarupa22-SD" | ||
|
|
||
| from unittest.mock import patch | ||
|
|
||
| import pandas as pd | ||
| import pytest | ||
|
|
||
| from pyaptamer.datasets import load_aptadb | ||
| from pyaptamer.datasets._loaders._load_aptamer import load_encoders | ||
|
|
||
|
|
||
def test_local_csv(tmp_path):
    """Test loading aptamer data from a local CSV file."""
    sample = pd.DataFrame(
        {
            "aptamer_id": ["APT001"],
            "aptamer_sequence": ["AUGCUU"],
            "target_name": ["Thrombin"],
            "interaction_present": ["1"],
        }
    )
    csv_path = tmp_path / "aptadb_sample.csv"
    sample.to_csv(csv_path, index=False)

    loaded = load_encoders(csv_path)
    assert isinstance(loaded, pd.DataFrame)
    assert not loaded.empty
    assert loaded.loc[0, "aptamer_sequence"] == "AUGCUU"
|
|
||
|
|
||
def test_uses_cache(tmp_path):
    """Test that cached data is used instead of downloading."""
    seeded = pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]})
    seeded.to_csv(tmp_path / "aptamer_interactions.csv", index=False)

    # With the CSV already in place, no Kaggle download should be attempted.
    result = load_aptadb(cache_dir=tmp_path)
    assert not result.empty
    assert result.loc[0, "aptamer_sequence"] == "AUGU"
|
|
||
|
|
||
def test_requires_kaggle(tmp_path):
    """Test that ImportError is raised when kaggle package is missing."""
    # tmp_path holds no CSV, so a download (and hence `import kaggle`) is
    # attempted; mapping "kaggle" to None makes that import fail.
    with patch.dict("sys.modules", {"kaggle": None}), pytest.raises(ImportError):
        load_aptadb(cache_dir=tmp_path)
|
|
||
|
|
||
def test_invalid_dataset(tmp_path):
    """Test error handling for invalid dataset download."""
    # Force the underlying download helper to blow up.
    broken_download = patch(
        "pyaptamer.datasets._loaders._load_aptamer._download_dataset",
        side_effect=Exception("boom"),
    )
    expected = r"Failed to download dataset .* from Kaggle"
    with broken_download, pytest.raises(RuntimeError, match=expected):
        load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path)
|
|
||
|
|
||
@pytest.fixture
def sample_aptadb_data():
    """Create sample aptamer interaction data for testing."""
    # One tuple per interaction record, in column order.
    records = [
        ("APT001", "TGT001", "ATCGATCGATCGATCG", "Thrombin", "P00734",
         "Homo sapiens", "Protein", "pH 7.4, 25°C", "12345678", 1),
        ("APT002", "TGT002", "GCTAGCTAGCTAGCTA", "VEGF", "P15692",
         "Homo sapiens", "Protein", "pH 7.0, 37°C", "87654321", 1),
        ("APT003", "TGT003", "TTAACCGGTTAACCGG", "Lysozyme", "P61626",
         "Gallus gallus", "Protein", "pH 8.0, 25°C", "11223344", 0),
    ]
    columns = [
        "aptamer_id",
        "target_id",
        "aptamer_sequence",
        "target_name",
        "target_uniprot",
        "organism",
        "ligand_type",
        "binding_conditions",
        "reference_pubmed_id",
        "interaction_present",
    ]
    return pd.DataFrame(records, columns=columns)
|
|
||
|
|
||
def test_sample_columns(sample_aptadb_data):
    """Test that sample data contains expected columns and data types."""
    frame = sample_aptadb_data
    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 3

    expected_columns = [
        "aptamer_id",
        "target_id",
        "aptamer_sequence",
        "target_name",
        "target_uniprot",
        "organism",
        "ligand_type",
        "binding_conditions",
        "reference_pubmed_id",
        "interaction_present",
    ]
    for col in expected_columns:
        assert col in frame.columns, f"Expected column '{col}' not found in dataset"

    # Sequences and names are free-form strings, stored as object dtype.
    assert frame["aptamer_sequence"].dtype == "object"
    assert frame["target_name"].dtype == "object"
|
|
||
|
|
||
@pytest.mark.slow
def test_cache_consistency(tmp_path):
    """Test that consecutive calls with cache yield identical DataFrames."""
    seeded = pd.DataFrame(
        {"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]}
    )
    seeded.to_csv(tmp_path / "aptamer_interactions.csv", index=False)

    first = load_aptadb(cache_dir=tmp_path)
    second = load_aptadb(cache_dir=tmp_path)
    pd.testing.assert_frame_equal(first, second)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should not be removed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added everything back.