-
Notifications
You must be signed in to change notification settings - Fork 11
[ENH] aptadb loader #159
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
[ENH] aptadb loader #159
Changes from all commits
7d1741f
7411080
e168112
06dc359
c1cf45a
0d95d0b
241ce49
30f6ed1
23fbe70
12a7833
dd0c587
0f9ebfa
8873f40
fdf3b6f
ff79244
83a78d7
321d772
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -6,17 +6,23 @@ | |
| ) | ||
| from pyaptamer.datasets._loaders._csv_loader import load_csv_dataset | ||
| from pyaptamer.datasets._loaders._hf_loader import load_hf_dataset | ||
| from pyaptamer.datasets._loaders._load_aptamer import ( | ||
| load_aptadb, | ||
| load_encoders, | ||
| ) | ||
| from pyaptamer.datasets._loaders._one_gnh import load_1gnh_structure | ||
| from pyaptamer.datasets._loaders._pfoa import load_pfoa_structure | ||
|
|
||
| __all__ = [ | ||
| "load_pfoa_structure", | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. it looks like you are deleting some exports here that are unrelated to your PR; can you please revert those changes?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please check now |
||
| "load_1gnh_structure", | ||
| "load_aptacom_full", | ||
| "load_aptacom_xy", | ||
| "load_aptacom_x_y", | ||
| "load_csv_dataset", | ||
| "load_hf_dataset", | ||
| "load_pfoa_structure", | ||
| "load_1gnh", | ||
| "load_1gnh_structure", | ||
| "load_aptadb", | ||
| "load_encoders", | ||
| ] | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,218 @@ | ||
| __author__ = "Satarupa22-SD" | ||
| __all__ = ["load_aptadb", "load_encoders"] | ||
|
|
||
| from pathlib import Path | ||
|
|
||
| import pandas as pd | ||
|
|
||
|
|
||
def _download_dataset(
    dataset_name: str,
    target_dir: Path,
    force_download: bool = False,
) -> None:
    """Fetch a Kaggle dataset into *target_dir* and unzip it.

    The ``kaggle`` import is deferred to call time because importing the
    package triggers API authentication as a side effect.
    """
    import kaggle  # avoid import-time auth

    target_dir.mkdir(parents=True, exist_ok=True)

    # A cached CSV short-circuits the network round-trip unless the
    # caller explicitly forces a fresh download.
    if any(target_dir.glob("*.csv")) and not force_download:
        return

    kaggle.api.dataset_download_files(
        dataset_name,
        path=str(target_dir),
        unzip=True,
    )
|
|
||
|
|
||
| def load_encoders( | ||
| path: str | Path, | ||
| *, | ||
| encoding: str | None = None, | ||
| **read_csv_kwargs, | ||
| ) -> pd.DataFrame: | ||
| """ | ||
| Load a CSV file into a pandas DataFrame with multi-encoding fallback. | ||
|
|
||
| This function attempts to read a CSV file using a list of common text | ||
| encodings, ensuring robust loading even when the file has ambiguous or | ||
| non-UTF-8 formatting. When ``encoding`` is explicitly provided, only that | ||
| encoding is used. Otherwise, the function tries the following encodings | ||
| in order: | ||
|
|
||
| ``["utf-8", "utf-8-sig", "latin-1", "cp1252", "windows-1252"]`` | ||
|
|
||
| Parameters | ||
| ---------- | ||
| path : str or pathlib.Path | ||
| Path to the CSV file to load. The file must exist locally. | ||
| encoding : str, optional | ||
| Specific encoding to use for reading the CSV. If ``None`` (default), | ||
| multiple encodings are tried sequentially until one succeeds. | ||
| **read_csv_kwargs | ||
| Additional keyword arguments passed directly to ``pandas.read_csv``. | ||
| Useful for specifying delimiters, NA values, column types, etc. | ||
|
|
||
| Returns | ||
| ------- | ||
| pandas.DataFrame | ||
| A DataFrame containing the parsed CSV data. | ||
|
|
||
| Raises | ||
| ------ | ||
| RuntimeError | ||
| If the file cannot be read with any of the attempted encodings. | ||
| FileNotFoundError | ||
| If the given path does not point to an existing file. | ||
|
|
||
| Examples | ||
| -------- | ||
| >>> df = load_encoders("aptamer_interactions.csv") | ||
| >>> df.head() | ||
| """ | ||
| candidate_encodings = ( | ||
| [ | ||
| "utf-8", | ||
| "utf-8-sig", | ||
| "latin-1", | ||
| "cp1252", | ||
| "windows-1252", | ||
| ] | ||
| if encoding is None | ||
| else [encoding] | ||
| ) | ||
|
|
||
| last_error: Exception | None = None | ||
|
|
||
| for enc in candidate_encodings: | ||
| try: | ||
| return pd.read_csv(path, encoding=enc, **read_csv_kwargs) | ||
| except Exception as e: # pragma: no cover - exercised via fallback | ||
| last_error = e | ||
| continue | ||
|
|
||
| raise RuntimeError( | ||
| f"Failed to read CSV {path} with encodings {candidate_encodings}: {last_error}" | ||
| ) | ||
|
|
||
|
|
||
def load_aptadb(
    dataset_name: str = "satarupadeb/aptamer-interactions",
    cache_dir: str | Path | None = None,
    force_download: bool = False,
    *,
    encoding: str | None = None,
    **kwargs,
) -> pd.DataFrame:
    """
    Download (if needed) and load the aptamer interaction dataset from Kaggle.

    The dataset is fetched with the Kaggle API, cached locally, and the
    expected file ``aptamer_interactions.csv`` is parsed into a pandas
    DataFrame. A cached CSV is reused unless ``force_download=True``.

    Parameters
    ----------
    dataset_name : str, optional
        Kaggle dataset identifier, formatted as ``"username/dataset-name"``.
        Default is ``"satarupadeb/aptamer-interactions"``.
    cache_dir : str or pathlib.Path, optional
        Directory for the downloaded/cached dataset. If ``None`` (default),
        the cache lives under ``~/.pyaptamer/cache/<dataset_name>/`` with
        the ``/`` in the identifier flattened to ``_``.
    force_download : bool, default False
        If ``True``, download even when a cached CSV already exists.
    encoding : str, optional
        Encoding forwarded to the CSV loader. If ``None``, several common
        encodings are attempted by ``load_encoders``.
    **kwargs
        Extra keyword arguments forwarded to ``load_encoders`` and
        ultimately to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The loaded aptamer interactions dataset.

    Raises
    ------
    ImportError
        If the ``kaggle`` Python package is not installed.
    RuntimeError
        If the dataset download fails.
    FileNotFoundError
        If ``aptamer_interactions.csv`` is not present after download.

    Notes
    -----
    Kaggle API credentials must be configured before calling this function,
    e.g. via the ``KAGGLE_USERNAME`` and ``KAGGLE_KEY`` environment
    variables.

    Examples
    --------
    >>> from pyaptamer.datasets import load_aptadb
    >>> df = load_aptadb()
    >>> df = load_aptadb(cache_dir="data/cache", force_download=True)
    """
    # Resolve the cache location; "/" is not valid in a directory name,
    # so the Kaggle identifier is flattened with "_".
    if cache_dir is None:
        cache_dir = (
            Path.home() / ".pyaptamer" / "cache" / dataset_name.replace("/", "_")
        )
    else:
        cache_dir = Path(cache_dir)

    # The expected CSV filename in the Kaggle dataset.
    expected_csv = cache_dir / "aptamer_interactions.csv"

    needs_download = force_download or not expected_csv.exists()
    if needs_download:
        try:
            _download_dataset(dataset_name, cache_dir, force_download=force_download)
        except ImportError as err:
            raise ImportError(
                "The 'kaggle' package is required to download datasets. "
                "Install it with: pip install kaggle"
            ) from err
        except Exception as e:
            raise RuntimeError(
                f"Failed to download dataset '{dataset_name}' from Kaggle: {e}"
            ) from e

    # Even after an apparently successful download the expected file may be
    # absent (e.g. the dataset layout changed) — fail loudly in that case.
    if not expected_csv.exists():
        raise FileNotFoundError(
            f"Expected file 'aptamer_interactions.csv' not found at {cache_dir}"
        )

    return load_encoders(path=str(expected_csv), encoding=encoding, **kwargs)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,123 @@ | ||
| __author__ = "Satarupa22-SD" | ||
|
|
||
| from unittest.mock import patch | ||
|
|
||
| import pandas as pd | ||
| import pytest | ||
|
|
||
| from pyaptamer.datasets import load_aptadb | ||
| from pyaptamer.datasets._loaders._load_aptamer import load_encoders | ||
|
|
||
|
|
||
def test_local_csv(tmp_path):
    """Test loading aptamer data from a local CSV file."""
    sample = pd.DataFrame(
        {
            "aptamer_id": ["APT001"],
            "aptamer_sequence": ["AUGCUU"],
            "target_name": ["Thrombin"],
            "interaction_present": ["1"],
        }
    )
    csv_path = tmp_path / "aptadb_sample.csv"
    sample.to_csv(csv_path, index=False)

    loaded = load_encoders(csv_path)
    assert isinstance(loaded, pd.DataFrame)
    assert not loaded.empty
    assert loaded.loc[0, "aptamer_sequence"] == "AUGCUU"
|
|
||
|
|
||
def test_uses_cache(tmp_path):
    """Test that cached data is used instead of downloading."""
    seeded = pd.DataFrame({"aptamer_sequence": ["AUGU"], "target_name": ["X"]})
    seeded.to_csv(tmp_path / "aptamer_interactions.csv", index=False)

    # With the CSV already in place, no Kaggle download should be attempted.
    result = load_aptadb(cache_dir=tmp_path)
    assert not result.empty
    assert result.loc[0, "aptamer_sequence"] == "AUGU"
|
|
||
|
|
||
def test_requires_kaggle(tmp_path):
    """Test that ImportError is raised when kaggle package is missing."""
    # tmp_path holds no CSV, so a download (and hence `import kaggle`) is
    # attempted; mapping "kaggle" to None makes that import fail.
    with patch.dict("sys.modules", {"kaggle": None}), pytest.raises(ImportError):
        load_aptadb(cache_dir=tmp_path)
|
|
||
|
|
||
def test_invalid_dataset(tmp_path):
    """Test error handling for invalid dataset download."""
    # Force the underlying download helper to blow up.
    broken_download = patch(
        "pyaptamer.datasets._loaders._load_aptamer._download_dataset",
        side_effect=Exception("boom"),
    )
    expected = r"Failed to download dataset .* from Kaggle"
    with broken_download, pytest.raises(RuntimeError, match=expected):
        load_aptadb("nonexistent/invalid-dataset", cache_dir=tmp_path)
|
|
||
|
|
||
@pytest.fixture
def sample_aptadb_data():
    """Create sample aptamer interaction data for testing."""
    # One tuple per interaction record, in column order.
    records = [
        ("APT001", "TGT001", "ATCGATCGATCGATCG", "Thrombin", "P00734",
         "Homo sapiens", "Protein", "pH 7.4, 25°C", "12345678", 1),
        ("APT002", "TGT002", "GCTAGCTAGCTAGCTA", "VEGF", "P15692",
         "Homo sapiens", "Protein", "pH 7.0, 37°C", "87654321", 1),
        ("APT003", "TGT003", "TTAACCGGTTAACCGG", "Lysozyme", "P61626",
         "Gallus gallus", "Protein", "pH 8.0, 25°C", "11223344", 0),
    ]
    columns = [
        "aptamer_id",
        "target_id",
        "aptamer_sequence",
        "target_name",
        "target_uniprot",
        "organism",
        "ligand_type",
        "binding_conditions",
        "reference_pubmed_id",
        "interaction_present",
    ]
    return pd.DataFrame(records, columns=columns)
|
|
||
|
|
||
def test_sample_columns(sample_aptadb_data):
    """Test that sample data contains expected columns and data types."""
    frame = sample_aptadb_data
    assert isinstance(frame, pd.DataFrame)
    assert len(frame) == 3

    expected_columns = [
        "aptamer_id",
        "target_id",
        "aptamer_sequence",
        "target_name",
        "target_uniprot",
        "organism",
        "ligand_type",
        "binding_conditions",
        "reference_pubmed_id",
        "interaction_present",
    ]
    for col in expected_columns:
        assert col in frame.columns, f"Expected column '{col}' not found in dataset"

    # Sequences and names are free-form strings, stored as object dtype.
    assert frame["aptamer_sequence"].dtype == "object"
    assert frame["target_name"].dtype == "object"
|
|
||
|
|
||
@pytest.mark.slow
def test_cache_consistency(tmp_path):
    """Test that consecutive calls with cache yield identical DataFrames."""
    seeded = pd.DataFrame(
        {"aptamer_sequence": ["AU"], "target_name": ["X"], "interaction_present": [0]}
    )
    seeded.to_csv(tmp_path / "aptamer_interactions.csv", index=False)

    first = load_aptadb(cache_dir=tmp_path)
    second = load_aptadb(cache_dir=tmp_path)
    pd.testing.assert_frame_equal(first, second)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
should not be removed
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I have added everything back.