-
Couldn't load subscription status.
- Fork 12
Move to adaptor backend #298
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 12 commits
9dd8beb
040da5e
4ba393d
7898ce7
4a3ea01
007ad3a
d577b02
3aaabbb
56c3285
0171d72
fe6092f
1289134
1d5c47f
d0fa9c9
81f6779
1540500
dd49569
daa4239
f11141a
13d356e
82ba58a
18818f6
dc683dd
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,172 @@ | ||
| from __future__ import annotations | ||
|
|
||
| import json | ||
| from abc import abstractmethod | ||
| from typing import TYPE_CHECKING, Any, ClassVar, TypeAlias, overload | ||
|
|
||
| from databackend import AbstractBackend | ||
|
|
||
| if TYPE_CHECKING: | ||
| import pandas as pd | ||
|
|
||
| _PandasDataFrame: TypeAlias = pd.DataFrame | ||
| _DataFrame: TypeAlias = _PandasDataFrame | ||
|
|
||
|
|
||
# Used by _create_adaptor for isinstance() checks against pandas DataFrames.
# NOTE(review): databackend presumably resolves the ("pandas", "DataFrame")
# backend lazily so pandas is not imported eagerly — confirm with its docs.
class _AbstractPandasFrame(AbstractBackend):
    _backends = [("pandas", "DataFrame")]
|
|
||
|
|
||
| _AbstractDF: TypeAlias = _AbstractPandasFrame | ||
|
|
||
|
|
||
| class _Adaptor: | ||
| _d: ClassVar[Any] | ||
|
||
|
|
||
| def __init__(self, data: Any) -> None: | ||
| self._d = data | ||
|
|
||
| @overload | ||
| def write_json(self, file: str) -> None: ... | ||
|
||
| @overload | ||
| def write_json(self, file: None = ...) -> str: ... | ||
| def write_json(self, file: str | None = None) -> str | None: | ||
| if file is None: | ||
| msg = ( | ||
| f"Writing to JSON string rather than file is not supported for " | ||
| f"{type(self._d)}" | ||
| ) | ||
| raise NotImplementedError(msg) | ||
|
|
||
| import json | ||
|
|
||
| json.dump(self._d, open(file, mode="w")) | ||
|
|
||
| def write_joblib(self, file: str) -> None: | ||
| import joblib | ||
|
|
||
| joblib.dump(self._d, file) | ||
|
|
||
| def write_csv(self, file: str) -> None: | ||
| msg = f"Writing to CSV is not supported for {type(self._d)}" | ||
| raise NotImplementedError(msg) | ||
|
|
||
| def write_parquet(self, file: str) -> None: | ||
| msg = f"Writing to Parquet is not supported for {type(self._d)}" | ||
| raise NotImplementedError(msg) | ||
|
|
||
| def write_feather(self, file: str) -> None: | ||
| msg = f"Writing to Feather is not supported for {type(self._d)}" | ||
| raise NotImplementedError(msg) | ||
|
|
||
| @property | ||
| def data_preview(self) -> str: | ||
| # note that the R library uses jsonlite::toJSON | ||
| import json | ||
|
|
||
| # TODO(compat): set display none in index.html | ||
| return json.dumps({}) | ||
|
|
||
| def default_title(self, name: str) -> str: | ||
| # TODO(compat): title says CSV rather than data.frame | ||
| # see https://github.com/machow/pins-python/issues/5 | ||
| return f"{name}: a pinned {self._obj_name}" | ||
|
|
||
| @property | ||
| def _obj_name(self) -> str: | ||
| return f"{type(self._d).__qualname__} object" | ||
|
|
||
|
|
||
class _DFAdaptor(_Adaptor):
    """Abstract adaptor for dataframe-like objects.

    Subclasses must implement ``columns``, ``shape``, and ``head``, and
    provide a ``write_json`` that returns a records-oriented JSON string.
    """

    # The wrapped dataframe. Annotated as a plain instance attribute: the
    # original ClassVar annotation was incorrect, since __init__ assigns
    # it per instance.
    _d: _DataFrame

    def __init__(self, data: _DataFrame) -> None:
        super().__init__(data)

    @property
    def df_type(self) -> str:
        """Human-readable dataframe type name."""
        # Consider over-riding this for specialized dataframes
        return "DataFrame"

    @property
    @abstractmethod
    def columns(self) -> list[Any]:
        """Column labels, as a plain list."""
        ...

    @property
    @abstractmethod
    def shape(self) -> tuple[int, int]:
        """(rows, columns) of the wrapped dataframe."""
        ...

    @abstractmethod
    def head(self, n: int) -> _DFAdaptor:
        """Return an adaptor wrapping the first *n* rows."""
        ...

    @property
    def data_preview(self) -> str:
        """Return a JSON preview of up to the first 100 rows plus column metadata."""
        # TODO(compat) is 100 hard-coded?
        # Note that we go df -> json -> dict, to take advantage of type
        # conversions in the dataframe library
        data: list[dict[Any, Any]] = json.loads(self.head(100).write_json())
        columns = [
            {"name": [col], "label": [col], "align": ["left"], "type": [""]}
            for col in self.columns
        ]

        # this reproduces R pins behavior, by omitting entries that would be null
        data_no_nulls = [{k: v for k, v in row.items() if v is not None} for row in data]

        return json.dumps({"data": data_no_nulls, "columns": columns})

    @property
    def _obj_name(self) -> str:
        # e.g. "3 x 2 DataFrame", mirroring the R pins title format.
        row, col = self.shape
        return f"{row} x {col} {self.df_type}"
|
|
||
|
|
||
class _PandasAdaptor(_DFAdaptor):
    """Adaptor for pandas DataFrames."""

    # The wrapped pandas DataFrame. Annotated as a plain instance attribute:
    # the original ClassVar annotation was incorrect, since __init__ assigns
    # it per instance.
    _d: _PandasDataFrame

    def __init__(self, data: _AbstractPandasFrame) -> None:
        super().__init__(data)

    @property
    def columns(self) -> list[Any]:
        """Column labels as a plain list."""
        return self._d.columns.tolist()

    @property
    def shape(self) -> tuple[int, int]:
        """(rows, columns) of the wrapped DataFrame."""
        return self._d.shape

    def head(self, n: int) -> _PandasAdaptor:
        """Return an adaptor wrapping the first *n* rows."""
        return _PandasAdaptor(self._d.head(n))

    @overload
    def write_json(self, file: str) -> None: ...

    @overload
    def write_json(self, file: None = ...) -> str: ...

    def write_json(self, file: str | None = None) -> str | None:
        """Return the DataFrame serialized as a records-oriented JSON string.

        Raises
        ------
        NotImplementedError
            If ``file`` is not None — this adaptor only produces JSON
            strings (the inverse of the base adaptor's behavior).
        """
        if file is not None:
            msg = (
                f"Writing to file rather than JSON string is not supported for "
                f"{type(self._d)}"
            )
            raise NotImplementedError(msg)

        return self._d.to_json(orient="records")

    def write_csv(self, file: str) -> None:
        """Write the DataFrame to *file* as CSV, excluding the index."""
        self._d.to_csv(file, index=False)

    def write_parquet(self, file: str) -> None:
        """Write the DataFrame to *file* as Parquet."""
        self._d.to_parquet(file)

    def write_feather(self, file: str) -> None:
        """Write the DataFrame to *file* as Feather."""
        self._d.to_feather(file)
|
|
||
|
|
||
@overload
def _create_adaptor(obj: _DataFrame) -> _DFAdaptor: ...
@overload
def _create_adaptor(obj: Any) -> _Adaptor: ...
def _create_adaptor(obj: Any | _DataFrame) -> _Adaptor | _DFAdaptor:
    """Pick and construct the adaptor class matching *obj*'s type."""
    is_pandas_df = isinstance(obj, _AbstractPandasFrame)
    return _PandasAdaptor(obj) if is_pandas_df else _Adaptor(obj)
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,8 @@ | ||
| from collections.abc import Sequence | ||
| from pathlib import Path | ||
| from typing import Any | ||
|
|
||
| from pins._adaptors import _create_adaptor | ||
|
|
||
| from .config import PINS_ENV_INSECURE_READ, get_allow_pickle_read | ||
| from .errors import PinsInsecureReadError | ||
|
|
@@ -13,15 +16,6 @@ | |
| REQUIRES_SINGLE_FILE = frozenset(["csv", "joblib"]) | ||
|
|
||
|
|
||
| def _assert_is_pandas_df(x, file_type: str) -> None: | ||
| import pandas as pd | ||
|
|
||
| if not isinstance(x, pd.DataFrame): | ||
| raise NotImplementedError( | ||
| f"Currently only pandas.DataFrame can be saved as type {file_type!r}." | ||
| ) | ||
|
|
||
|
|
||
| def load_path(filename: str, path_to_version, pin_type=None): | ||
| # file path creation ------------------------------------------------------ | ||
| if pin_type == "table": | ||
|
|
@@ -135,6 +129,8 @@ def save_data( | |
| # as argument to board, and then type dispatchers for explicit cases | ||
| # of saving / loading objects different ways. | ||
|
|
||
| adaptor = _create_adaptor(obj) | ||
|
||
|
|
||
| if apply_suffix: | ||
| if pin_type == "file": | ||
| suffix = "".join(Path(obj).suffixes) | ||
|
|
@@ -149,39 +145,22 @@ def save_data( | |
| final_name = f"{fname}{suffix}" | ||
|
|
||
| if pin_type == "csv": | ||
| _assert_is_pandas_df(obj, file_type=type) | ||
|
|
||
| obj.to_csv(final_name, index=False) | ||
|
|
||
| adaptor.write_csv(final_name) | ||
| elif pin_type == "arrow": | ||
| # NOTE: R pins accepts the type arrow, and saves it as feather. | ||
| # we allow reading this type, but raise an error for writing. | ||
| _assert_is_pandas_df(obj, file_type=type) | ||
|
|
||
| obj.to_feather(final_name) | ||
|
|
||
| adaptor.write_feather(final_name) | ||
| elif pin_type == "feather": | ||
| _assert_is_pandas_df(obj, file_type=type) | ||
|
|
||
| raise NotImplementedError( | ||
| msg = ( | ||
machow marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| 'Saving data as type "feather" no longer supported. Use type "arrow" instead.' | ||
| ) | ||
|
|
||
| raise NotImplementedError(msg) | ||
| elif pin_type == "parquet": | ||
| _assert_is_pandas_df(obj, file_type=type) | ||
|
|
||
| obj.to_parquet(final_name) | ||
|
|
||
| adaptor.write_parquet(final_name) | ||
| elif pin_type == "joblib": | ||
| import joblib | ||
|
|
||
| joblib.dump(obj, final_name) | ||
|
|
||
| adaptor.write_joblib(final_name) | ||
| elif pin_type == "json": | ||
| import json | ||
|
|
||
| json.dump(obj, open(final_name, "w")) | ||
|
|
||
| adaptor.write_json(final_name) | ||
| elif pin_type == "file": | ||
| import contextlib | ||
| import shutil | ||
|
|
@@ -202,14 +181,6 @@ def save_data( | |
| return final_name | ||
|
|
||
|
|
||
def default_title(obj, name):
    """Build the default title string for a pin named *name* wrapping *obj*."""
    import pandas as pd

    if not isinstance(obj, pd.DataFrame):
        return f"{name}: a pinned {type(obj).__qualname__} object"

    # TODO(compat): title says CSV rather than data.frame
    # see https://github.com/machow/pins-python/issues/5
    dims = " x ".join(str(d) for d in obj.shape)
    return f"{name}: a pinned {dims} DataFrame"
def default_title(obj: Any, name: str) -> str:
    """Return the default pin title for *obj*, delegating to its adaptor."""
    # Kept for backward compatibility only.
    return _create_adaptor(obj).default_title(name)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Since the file `_adaptors.py` starts with an underscore, it seems okay for its contents to not use a leading underscore (e.g. `AbstractPandasFrame`). (Though it's also totally okay to punt this, since it's all internal — especially if other PRs are building on this one.)
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yeah I agree with that.