Skip to content
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
9dd8beb
Support adaptor in prepare_pin_version
nathanjmcdougall Aug 13, 2024
040da5e
Use adaptor in save_data
nathanjmcdougall Aug 13, 2024
4ba393d
Use adaptor for default_title
nathanjmcdougall Aug 13, 2024
7898ce7
underscore prefix for _adaptors.py; abstracting df_type in default_title
nathanjmcdougall Aug 13, 2024
4a3ea01
Removing duplication in _obj_name definition
nathanjmcdougall Aug 13, 2024
007ad3a
Use adaptor in _create_meta
nathanjmcdougall Aug 13, 2024
d577b02
Pass pyright
nathanjmcdougall Aug 13, 2024
3aaabbb
Fix broken import
nathanjmcdougall Aug 13, 2024
56c3285
Refactoring type hints to avoid use of Self
nathanjmcdougall Aug 20, 2024
0171d72
Remove singleton Union
nathanjmcdougall Aug 20, 2024
fe6092f
Add databackend as a dependency
nathanjmcdougall Aug 27, 2024
1289134
Merge branch 'main' into feature/move-to-adaptor-backend
nathanjmcdougall Dec 17, 2024
1d5c47f
dev: add ruff to pyproject.toml
machow Apr 1, 2025
d0fa9c9
feat: allow save_data to accept an Adaptor
machow Apr 1, 2025
81f6779
Remove unnecessary underscores
nathanjmcdougall Apr 1, 2025
1540500
Remove misleading/unnecessary ClassVar declaration
nathanjmcdougall Apr 1, 2025
dd49569
Merge branch 'feature/move-to-adaptor-backend' of https://github.com/…
nathanjmcdougall Apr 1, 2025
daa4239
Separate write_json from to_json (CQS)
nathanjmcdougall Apr 1, 2025
f11141a
Move calls to create_adaptor to hide them at a lower level
nathanjmcdougall Apr 1, 2025
13d356e
Add some tests
nathanjmcdougall Apr 1, 2025
82ba58a
Merge branch 'rstudio:main' into feature/move-to-adaptor-backend
nathanjmcdougall Apr 30, 2025
18818f6
Use backported typing_extensions.TypeAlias for Python 3.9
nathanjmcdougall Jun 3, 2025
dc683dd
add typing_extensions
isabelizimm Jun 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
172 changes: 172 additions & 0 deletions pins/_adaptors.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
from __future__ import annotations

import json
from abc import abstractmethod
from typing import TYPE_CHECKING, Any, ClassVar, TypeAlias, overload

from databackend import AbstractBackend

if TYPE_CHECKING:
import pandas as pd

_PandasDataFrame: TypeAlias = pd.DataFrame
_DataFrame: TypeAlias = _PandasDataFrame


class _AbstractPandasFrame(AbstractBackend):
    """Stand-in for ``pandas.DataFrame`` usable in ``isinstance`` checks.

    ``databackend.AbstractBackend`` matches instances of the registered
    backend classes lazily, so pandas is never imported just to test whether
    an object is a DataFrame.
    """

    _backends = [("pandas", "DataFrame")]


_AbstractDF: TypeAlias = _AbstractPandasFrame


class _Adaptor:
_d: ClassVar[Any]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is the use of ClassVar right here? It seems like _d is not a class variable (it's set on the instance).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah also agreed. I can't recall why I did that, it might have been some misguided thoughts about pyright behaviour.


def __init__(self, data: Any) -> None:
self._d = data

@overload
def write_json(self, file: str) -> None: ...
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This behavior was modified to sometimes return a string, which seems to violate command-query separation (maybe to reflect the implementation in the pandas adaptor?). Can we revert so that the adaptor refactors, but does not extend the original behavior?

edit: see comment in _DFAdaptor.write_json()

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Totally, good idea.

@overload
def write_json(self, file: None = ...) -> str: ...
def write_json(self, file: str | None = None) -> str | None:
if file is None:
msg = (
f"Writing to JSON string rather than file is not supported for "
f"{type(self._d)}"
)
raise NotImplementedError(msg)

import json

json.dump(self._d, open(file, mode="w"))

def write_joblib(self, file: str) -> None:
import joblib

joblib.dump(self._d, file)

def write_csv(self, file: str) -> None:
msg = f"Writing to CSV is not supported for {type(self._d)}"
raise NotImplementedError(msg)

def write_parquet(self, file: str) -> None:
msg = f"Writing to Parquet is not supported for {type(self._d)}"
raise NotImplementedError(msg)

def write_feather(self, file: str) -> None:
msg = f"Writing to Feather is not supported for {type(self._d)}"
raise NotImplementedError(msg)

@property
def data_preview(self) -> str:
# note that the R library uses jsonlite::toJSON
import json

# TODO(compat): set display none in index.html
return json.dumps({})

def default_title(self, name: str) -> str:
# TODO(compat): title says CSV rather than data.frame
# see https://github.com/machow/pins-python/issues/5
return f"{name}: a pinned {self._obj_name}"

@property
def _obj_name(self) -> str:
return f"{type(self._d).__qualname__} object"


class _DFAdaptor(_Adaptor):
    """Base adaptor for dataframe-like objects.

    Subclasses implement ``columns``, ``shape``, and ``head`` for a
    specific dataframe library; this class derives the preview and title
    text from those primitives.
    """

    # The wrapped dataframe. NOTE: not a ClassVar — it is assigned
    # per-instance in __init__ (the previous ClassVar annotation was
    # misleading).
    _d: _DataFrame

    def __init__(self, data: _DataFrame) -> None:
        super().__init__(data)

    @property
    def df_type(self) -> str:
        """Human-readable dataframe type name, used in default titles."""
        # Consider over-riding this for specialized dataframes
        return "DataFrame"

    @property
    @abstractmethod
    def columns(self) -> list[Any]:
        """Column labels of the wrapped dataframe, as a plain list."""
        ...

    @property
    @abstractmethod
    def shape(self) -> tuple[int, int]:
        """(n_rows, n_columns) of the wrapped dataframe."""
        ...

    @abstractmethod
    def head(self, n: int) -> _DFAdaptor:
        """Return a new adaptor wrapping the first ``n`` rows."""
        ...

    @property
    def data_preview(self) -> str:
        """JSON preview of the first rows, in the format index.html expects."""
        # TODO(compat) is 100 hard-coded?
        # Note that we go df -> json -> dict, to take advantage of type conversions in the dataframe library
        data: list[dict[Any, Any]] = json.loads(self.head(100).write_json())
        columns = [
            {"name": [col], "label": [col], "align": ["left"], "type": [""]}
            for col in self.columns
        ]

        # this reproduces R pins behavior, by omitting entries that would be null
        data_no_nulls = [{k: v for k, v in row.items() if v is not None} for row in data]

        return json.dumps({"data": data_no_nulls, "columns": columns})

    @property
    def _obj_name(self) -> str:
        """E.g. ``"10 x 3 DataFrame"`` — used in default pin titles."""
        row, col = self.shape
        return f"{row} x {col} {self.df_type}"


class _PandasAdaptor(_DFAdaptor):
    """Adaptor for ``pandas.DataFrame`` objects."""

    # The wrapped dataframe. NOTE: not a ClassVar — it is assigned
    # per-instance in __init__ (the previous ClassVar annotation was
    # misleading).
    _d: _PandasDataFrame

    def __init__(self, data: _AbstractPandasFrame) -> None:
        super().__init__(data)

    @property
    def columns(self) -> list[Any]:
        """Column labels of the wrapped dataframe, as a plain list."""
        return self._d.columns.tolist()

    @property
    def shape(self) -> tuple[int, int]:
        """(n_rows, n_columns) of the wrapped dataframe."""
        return self._d.shape

    def head(self, n: int) -> _PandasAdaptor:
        """Return a new adaptor wrapping the first ``n`` rows."""
        return _PandasAdaptor(self._d.head(n))

    @overload
    def write_json(self, file: str) -> None: ...
    @overload
    def write_json(self, file: None = ...) -> str: ...
    def write_json(self, file: str | None = None) -> str | None:
        """Serialize the dataframe to a JSON string (``orient="records"``).

        Unlike the base class, only the string form (``file=None``) is
        supported here; writing directly to a file raises.

        The ``file: None`` overload carries ``= ...`` so the zero-argument
        call made by ``_DFAdaptor.data_preview`` type-checks.
        """
        if file is not None:
            msg = (
                f"Writing to file rather than JSON string is not supported for "
                f"{type(self._d)}"
            )
            raise NotImplementedError(msg)

        return self._d.to_json(orient="records")

    def write_csv(self, file: str) -> None:
        """Write the dataframe to ``file`` as CSV, without the index."""
        self._d.to_csv(file, index=False)

    def write_parquet(self, file: str) -> None:
        """Write the dataframe to ``file`` as Parquet."""
        self._d.to_parquet(file)

    def write_feather(self, file: str) -> None:
        """Write the dataframe to ``file`` as Feather."""
        self._d.to_feather(file)


@overload
def _create_adaptor(obj: _DataFrame) -> _DFAdaptor: ...
@overload
def _create_adaptor(obj: Any) -> _Adaptor: ...
def _create_adaptor(obj: Any | _DataFrame) -> _Adaptor | _DFAdaptor:
    """Wrap ``obj`` in the most specific adaptor available for its type."""
    if isinstance(obj, _AbstractPandasFrame):
        return _PandasAdaptor(obj)
    return _Adaptor(obj)
48 changes: 13 additions & 35 deletions pins/boards.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from importlib_resources import files
from importlib_resources.abc import Traversable

from ._adaptors import _Adaptor, _create_adaptor
from .cache import PinsCache
from .config import get_allow_rsc_short_name
from .drivers import REQUIRES_SINGLE_FILE, default_title, load_data, load_file, save_data
Expand All @@ -25,6 +26,8 @@

_log = logging.getLogger(__name__)

_ = default_title # Keep this import for backward compatibility


class IFileSystem(Protocol):
protocol: str | list
Expand Down Expand Up @@ -695,6 +698,8 @@ def prepare_pin_version(
created: datetime | None = None,
object_name: str | list[str] | None = None,
):
x = _create_adaptor(x)

meta = self._create_meta(
pin_dir_path,
x,
Expand All @@ -716,7 +721,7 @@ def prepare_pin_version(
def _create_meta(
self,
pin_dir_path,
x,
x: _Adaptor,
name: str | None = None,
type: str | None = None,
title: str | None = None,
Expand All @@ -733,7 +738,7 @@ def _create_meta(
raise NotImplementedError("Type argument is required.")

if title is None:
title = default_title(x, name)
title = x.default_title(name)

# create metadata from object on disk ---------------------------------
# save all pin data to a temporary folder (including data.txt), so we
Expand All @@ -749,7 +754,7 @@ def _create_meta(
else:
p_obj = str(Path(pin_dir_path) / object_name)
# file is saved locally in order to hash, calc size
file_names = save_data(x, p_obj, type, apply_suffix)
file_names = save_data(x._d, p_obj, type, apply_suffix)

meta = self.meta_factory.create(
pin_dir_path,
Expand Down Expand Up @@ -1199,14 +1204,16 @@ def user_name(self):
return self.fs.api.get_user()["username"]

def prepare_pin_version(self, pin_dir_path, x, name: str | None, *args, **kwargs):
adaptor = _create_adaptor(x)

# RSC pin names can have form <user_name>/<name>, but this will try to
# create the object in a directory named <user_name>. So we grab just
# the <name> part.
short_name = name.split("/")[-1]

# TODO(compat): py pins always uses the short name, R pins uses w/e the
# user passed, but guessing people want the long name?
meta = super()._create_meta(pin_dir_path, x, short_name, *args, **kwargs)
meta = super()._create_meta(pin_dir_path, adaptor, short_name, *args, **kwargs)
meta.name = name

# copy in files needed by index.html ----------------------------------
Expand All @@ -1224,46 +1231,17 @@ def prepare_pin_version(self, pin_dir_path, x, name: str | None, *args, **kwargs
# render index.html ------------------------------------------------

all_files = [meta.file] if isinstance(meta.file, str) else meta.file
pin_files = ", ".join(f"""<a href="{x}">{x}</a>""" for x in all_files)
pin_files = ", ".join(f"""<a href="{file}">{file}</a>""" for file in all_files)

context = {
"date": meta.version.created.replace(microsecond=0),
"pin_name": self.path_to_pin(name),
"pin_files": pin_files,
"pin_metadata": meta,
"board_deparse": board_deparse(self),
"data_preview": adaptor.data_preview,
}

# data preview ----

# TODO: move out data_preview logic? Can we draw some limits here?
# note that the R library uses jsonlite::toJSON

import json

import pandas as pd

if isinstance(x, pd.DataFrame):
# TODO(compat) is 100 hard-coded?
# Note that we go df -> json -> dict, to take advantage of pandas type conversions
data = json.loads(x.head(100).to_json(orient="records"))
columns = [
{"name": [col], "label": [col], "align": ["left"], "type": [""]}
for col in x
]

# this reproduces R pins behavior, by omitting entries that would be null
data_no_nulls = [
{k: v for k, v in row.items() if v is not None} for row in data
]

context["data_preview"] = json.dumps(
{"data": data_no_nulls, "columns": columns}
)
else:
# TODO(compat): set display none in index.html
context["data_preview"] = json.dumps({})

# do not show r code if not round-trip friendly
if meta.type in ["joblib"]:
context["show_r_style"] = "display:none"
Expand Down
59 changes: 15 additions & 44 deletions pins/drivers.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
from collections.abc import Sequence
from pathlib import Path
from typing import Any

from pins._adaptors import _create_adaptor

from .config import PINS_ENV_INSECURE_READ, get_allow_pickle_read
from .errors import PinsInsecureReadError
Expand All @@ -13,15 +16,6 @@
REQUIRES_SINGLE_FILE = frozenset(["csv", "joblib"])


def _assert_is_pandas_df(x, file_type: str) -> None:
import pandas as pd

if not isinstance(x, pd.DataFrame):
raise NotImplementedError(
f"Currently only pandas.DataFrame can be saved as type {file_type!r}."
)


def load_path(filename: str, path_to_version, pin_type=None):
# file path creation ------------------------------------------------------
if pin_type == "table":
Expand Down Expand Up @@ -135,6 +129,8 @@ def save_data(
# as argument to board, and then type dispatchers for explicit cases
# of saving / loading objects different ways.

adaptor = _create_adaptor(obj)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could we allow obj to be either _Adaptor | Any (the typing is redundant, but maybe a helpful signal)? Then, if obj is not an adaptor, this line could create one.

I'm guessing keeping the original save_data behavior is useful for testing, but it'd be nice not to have to roundtrip by calling it on save_data(_Adaptor._d, ...) in this board method:

https://github.com/rstudio/pins-python/pull/298/files#diff-36792b1eedbe5453d2c6b58286ab65eed0c9286e94e2695a736edff37164e885R757

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see you've done that here:

d0fa9c9

I will:

  • Write a test case for this
  • Refactor to take advantage of this


if apply_suffix:
if pin_type == "file":
suffix = "".join(Path(obj).suffixes)
Expand All @@ -149,39 +145,22 @@ def save_data(
final_name = f"{fname}{suffix}"

if pin_type == "csv":
_assert_is_pandas_df(obj, file_type=type)

obj.to_csv(final_name, index=False)

adaptor.write_csv(final_name)
elif pin_type == "arrow":
# NOTE: R pins accepts the type arrow, and saves it as feather.
# we allow reading this type, but raise an error for writing.
_assert_is_pandas_df(obj, file_type=type)

obj.to_feather(final_name)

adaptor.write_feather(final_name)
elif pin_type == "feather":
_assert_is_pandas_df(obj, file_type=type)

raise NotImplementedError(
msg = (
'Saving data as type "feather" no longer supported. Use type "arrow" instead.'
)

raise NotImplementedError(msg)
elif pin_type == "parquet":
_assert_is_pandas_df(obj, file_type=type)

obj.to_parquet(final_name)

adaptor.write_parquet(final_name)
elif pin_type == "joblib":
import joblib

joblib.dump(obj, final_name)

adaptor.write_joblib(final_name)
elif pin_type == "json":
import json

json.dump(obj, open(final_name, "w"))

adaptor.write_json(final_name)
elif pin_type == "file":
import contextlib
import shutil
Expand All @@ -202,14 +181,6 @@ def save_data(
return final_name


def default_title(obj, name):
import pandas as pd

if isinstance(obj, pd.DataFrame):
# TODO(compat): title says CSV rather than data.frame
# see https://github.com/machow/pins-python/issues/5
shape_str = " x ".join(map(str, obj.shape))
return f"{name}: a pinned {shape_str} DataFrame"
else:
obj_name = type(obj).__qualname__
return f"{name}: a pinned {obj_name} object"
def default_title(obj: Any, name: str) -> str:
    """Return the default pin title for ``obj`` named ``name``.

    Kept for backward compatibility only; delegates to the object's adaptor.
    """
    adaptor = _create_adaptor(obj)
    return adaptor.default_title(name)
Loading
Loading