Skip to content

Commit 6915e42

Browse files
authored
Merge pull request #66 from machow/feat-read-insecure-flag
feat!: require user to opt-in to read unsafe pins
2 parents 7fca499 + bd3fb4d commit 6915e42

File tree

8 files changed

+235
-42
lines changed

8 files changed

+235
-42
lines changed

docs/getting_started.Rmd

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,9 @@ While we’ll do our best to keep the automatically generated metadata consisten
130130
## 🚧 Versioning
131131

132132

133+
> ⚠️: Warning the examples in this section use joblib to read and write data. Joblib uses the pickle format, and **pickle files are not secure**. Only read pickle files you trust. In order to read pickle files, set the `allow_pickle_read=True` argument. See: https://docs.python.org/3/library/pickle.html.
134+
135+
133136
> ⚠️: versioning is not yet implemented. These docs are copied from R pins.
134137
135138
In many situations it's useful to version pins, so that writing to an existing pin does not replace the existing data, but instead adds a new copy.
@@ -154,9 +157,8 @@ The primary exception is `board_folder()` since that stores data on your compute
154157
Once you have turned versioning on, every `pin_write()` will create a new version:
155158

156159
```{python}
157-
# TODO: can save lists using joblib, but should warn about security
160+
board2 = board_temp(versioned = True, allow_pickle_read=True)
158161

159-
board2 = board_temp(versioned = True)
160162
board2.pin_write([1,2,3,4,5], name = "x", type = "joblib", title="TODO")
161163
board2.pin_write([1,2,3], name = "x", type = "joblib", title="TODO")
162164
board2.pin_write([1,2], name = "x", type = "joblib", title="TODO")

pins/boards.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@ def __init__(
5050
fs: IFileSystem,
5151
versioned=True,
5252
meta_factory=MetaFactory(),
53+
allow_pickle_read: "bool | None" = None,
5354
):
5455
self.board = str(board)
5556
self.fs = fs
@@ -59,6 +60,7 @@ def __init__(
5960
raise NotImplementedError()
6061

6162
self.versioned = versioned
63+
self.allow_pickle_read = allow_pickle_read
6264

6365
def pin_exists(self, name: str) -> bool:
6466
"""Determine if a pin exists.
@@ -201,7 +203,10 @@ def pin_read(self, name, version: Optional[str] = None, hash: Optional[str] = No
201203
pin_name = self.path_to_pin(name)
202204

203205
return load_data(
204-
meta, self.fs, self.construct_path([pin_name, meta.version.version])
206+
meta,
207+
self.fs,
208+
self.construct_path([pin_name, meta.version.version]),
209+
allow_pickle_read=self.allow_pickle_read,
205210
)
206211

207212
def pin_write(
@@ -603,7 +608,9 @@ def pin_download(self, name, version=None, hash=None) -> Sequence[str]:
603608
meta = self.pin_meta(name, version)
604609

605610
if isinstance(meta, MetaRaw):
606-
return load_data(meta, self.fs, None)
611+
return load_data(
612+
meta, self.fs, None, allow_pickle_read=self.allow_pickle_read
613+
)
607614

608615
raise NotImplementedError("TODO: allow download beyond MetaRaw.")
609616

pins/config.py

Lines changed: 21 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,11 +2,30 @@
22
import os
33

44
PINS_NAME = "pins-py"
5+
PINS_ENV_DATA_DIR = "PINS_DATA_DIR"
6+
PINS_ENV_CACHE_DIR = "PINS_CACHE_DIR"
7+
PINS_ENV_INSECURE_READ = "PINS_ALLOW_PICKLE_READ"
58

69

710
def get_data_dir():
8-
return os.environ.get("PINS_DATA_DIR", appdirs.user_data_dir(PINS_NAME))
11+
return os.environ.get(PINS_ENV_DATA_DIR, appdirs.user_data_dir(PINS_NAME))
912

1013

1114
def get_cache_dir():
12-
return os.environ.get("PINS_CACHE_DIR", appdirs.user_cache_dir(PINS_NAME))
15+
return os.environ.get(PINS_ENV_CACHE_DIR, appdirs.user_cache_dir(PINS_NAME))
16+
17+
18+
def get_allow_pickle_read(flag):
19+
if flag is None:
20+
env_var = os.environ.get(PINS_ENV_INSECURE_READ, "0")
21+
try:
22+
env_int = int(env_var)
23+
except ValueError:
24+
raise ValueError(
25+
f"{PINS_ENV_INSECURE_READ} must be '0' or '1', but was set to "
26+
f"{repr(env_var)}."
27+
)
28+
29+
flag = bool(env_int)
30+
31+
return flag

pins/constructors.py

Lines changed: 102 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ def board(
2121
path: str = "",
2222
versioned: bool = True,
2323
cache: "DEFAULT | None" = DEFAULT,
24+
allow_pickle_read=None,
2425
storage_options: "dict | None" = None,
2526
board_factory: "callable | BaseBoard | None" = None,
2627
):
@@ -39,6 +40,14 @@ def board(
3940
Whether to use a cache. By default, pins attempts to select the right cache
4041
directory, given your filesystem. If None is passed, then no cache will be
4142
used. You can set the cache using the PINS_CACHE_DIR environment variable.
43+
allow_pickle_read: optional, bool
44+
Whether to allow reading pins that use the pickle protocol. Pickles are unsafe,
45+
and can execute arbitrary code. Only allow reading pickles if you trust the
46+
board to be able to execute python code on your computer.
47+
48+
You can enable reading pickles by setting this to True, or by setting the
49+
environment variable PINS_ALLOW_PICKLE_READ. If both are set, this argument
50+
takes precedence.
4251
storage_options:
4352
Additional options passed to the underlying filesystem created by
4453
fsspec.filesystem.
@@ -72,27 +81,50 @@ def board(
7281

7382
# construct board ----
7483

84+
pickle_kwargs = {"allow_pickle_read": allow_pickle_read}
7585
# TODO: should use a registry or something
7686
if protocol == "rsc" and board_factory is None:
77-
board = BoardRsConnect(path, fs, versioned)
87+
board = BoardRsConnect(path, fs, versioned, **pickle_kwargs)
7888
elif board_factory is not None:
79-
board = board_factory(path, fs, versioned)
89+
board = board_factory(path, fs, versioned, **pickle_kwargs)
8090
else:
81-
board = BaseBoard(path, fs, versioned)
91+
board = BaseBoard(path, fs, versioned, **pickle_kwargs)
8292
return board
8393

8494

8595
# TODO(#31): change file boards to unversioned once implemented
8696

8797

88-
def board_folder(path, versioned=True):
89-
return board("file", path, versioned, cache=None)
98+
def board_folder(path: str, versioned=True, allow_pickle_read=None):
99+
"""Create a pins board inside a folder.
100+
101+
Parameters
102+
----------
103+
path:
104+
The folder that will hold the board.
105+
**kwargs:
106+
Passed to the pins.board function.
107+
"""
108+
109+
return board(
110+
"file", path, versioned, cache=None, allow_pickle_read=allow_pickle_read
111+
)
90112

91113

92-
def board_temp(versioned=True):
114+
def board_temp(versioned=True, allow_pickle_read=None):
115+
"""Create a pins board in a temporary directory.
116+
117+
Parameters
118+
----------
119+
**kwargs:
120+
Passed to the pins.board function.
121+
"""
122+
93123
tmp_dir = tempfile.TemporaryDirectory()
94124

95-
board_obj = board("file", tmp_dir.name, versioned, cache=None)
125+
board_obj = board(
126+
"file", tmp_dir.name, versioned, cache=None, allow_pickle_read=allow_pickle_read
127+
)
96128

97129
# TODO: this is necessary to ensure the temporary directory dir persists.
98130
# without maintaining a reference to it, it could be deleted after this
@@ -102,26 +134,36 @@ def board_temp(versioned=True):
102134
return board_obj
103135

104136

105-
def board_local(versioned=True):
137+
def board_local(versioned=True, allow_pickle_read=None):
138+
"""Create a board in a system data directory.
139+
140+
Parameters
141+
----------
142+
**kwargs:
143+
Passed to the pins.board function.
144+
"""
106145
path = get_data_dir()
107146

108-
return board("file", path, versioned, cache=None)
147+
return board(
148+
"file", path, versioned, cache=None, allow_pickle_read=allow_pickle_read
149+
)
109150

110151

111-
def board_github(org, repo, path="", versioned=True, cache=DEFAULT):
152+
def board_github(
153+
org, repo, path="", versioned=True, cache=DEFAULT, allow_pickle_read=None
154+
):
112155
"""Returns a github pin board.
113156
114157
Parameters
115158
----------
116-
path:
117-
TODO
118-
versioned:
119-
TODO
120159
org:
121160
Name of the github org (e.g. user account).
122161
repo:
123162
Name of the repo.
124-
163+
path:
164+
A subfolder in the github repo holding the board.
165+
**kwargs:
166+
Passed to the pins.board function.
125167
126168
Note
127169
----
@@ -144,12 +186,26 @@ def board_github(org, repo, path="", versioned=True, cache=DEFAULT):
144186
"""
145187

146188
return board(
147-
"github", path, versioned, cache, storage_options={"org": org, "repo": repo}
189+
"github",
190+
path,
191+
versioned,
192+
cache,
193+
allow_pickle_read=allow_pickle_read,
194+
storage_options={"org": org, "repo": repo},
148195
)
149196

150197

151-
def board_urls(path: str, pin_paths: dict, cache=DEFAULT):
152-
"""
198+
def board_urls(path: str, pin_paths: dict, cache=DEFAULT, allow_pickle_read=None):
199+
"""Create a board from individual urls.
200+
201+
Parameters
202+
----------
203+
path:
204+
A base url to prefix all individual pin urls with.
205+
pin_paths: Mapping
206+
A dictionary mapping pin name to pin url .
207+
**kwargs:
208+
Passed to the pins.board function.
153209
154210
Example
155211
-------
@@ -175,18 +231,29 @@ def board_urls(path: str, pin_paths: dict, cache=DEFAULT):
175231
else:
176232
raise NotImplementedError("Can't currently pass own cache object")
177233

178-
return BoardManual(path, fs, versioned=True, pin_paths=pin_paths)
234+
return BoardManual(
235+
path,
236+
fs,
237+
versioned=True,
238+
allow_pickle_read=allow_pickle_read,
239+
pin_paths=pin_paths,
240+
)
179241

180242

181-
def board_rsconnect(versioned=True, server_url=None, api_key=None, cache=DEFAULT):
182-
"""
243+
def board_rsconnect(
244+
versioned=True, server_url=None, api_key=None, cache=DEFAULT, allow_pickle_read=None
245+
):
246+
"""Create a board to read and write pins from an RStudio Connect instance.
183247
184248
Parameters
185249
----------
186250
server_url:
187-
TODO
251+
Url to the RStudio Connect server.
188252
api_key:
189-
TODO
253+
API key for server. If not specified, pins will attempt to read it from
254+
CONNECT_API_KEY environment variable.
255+
**kwargs:
256+
Passed to the pins.board function.
190257
"""
191258

192259
# TODO: api_key can be passed in to underlying RscApi, equiv to R's manual mode
@@ -196,9 +263,18 @@ def board_rsconnect(versioned=True, server_url=None, api_key=None, cache=DEFAULT
196263
server_url = os.environ.get("CONNECT_SERVER")
197264

198265
kwargs = dict(server_url=server_url, api_key=api_key)
199-
return board("rsc", "", versioned, storage_options=kwargs)
266+
return board("rsc", "", versioned, cache, allow_pickle_read, storage_options=kwargs)
267+
200268

269+
def board_s3(path, versioned=True, cache=DEFAULT, allow_pickle_read=None):
270+
"""Create a board to read and write pins from an AWS S3 bucket folder.
201271
202-
def board_s3(path, versioned=True):
272+
Parameters
273+
----------
274+
path:
275+
Path of form <bucket_name>/<optional>/<subdirectory>.
276+
**kwargs:
277+
Passed to the pins.board function.
278+
"""
203279
# TODO: user should be able to specify storage options here?
204-
return board("s3", path, versioned)
280+
return board("s3", path, versioned, cache, allow_pickle_read)

pins/drivers.py

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,14 +2,23 @@
22

33
from pathlib import Path
44

5+
from .config import get_allow_pickle_read, PINS_ENV_INSECURE_READ
56
from .meta import Meta
6-
7+
from .errors import PinsInsecureReadError
78

89
# TODO: move IFileSystem out of boards, to fix circular import
910
# from .boards import IFileSystem
1011

1112

12-
def load_data(meta: Meta, fs, path_to_version: "str | None" = None):
13+
UNSAFE_TYPES = frozenset(["joblib"])
14+
15+
16+
def load_data(
17+
meta: Meta,
18+
fs,
19+
path_to_version: "str | None" = None,
20+
allow_pickle_read: "bool | None" = None,
21+
):
1322
"""Return loaded data, based on meta type.
1423
Parameters
1524
----------
@@ -21,6 +30,15 @@ def load_data(meta: Meta, fs, path_to_version: "str | None" = None):
2130
A filepath used as the parent directory the data to-be-loaded lives in.
2231
"""
2332
# TODO: extandable loading with deferred importing
33+
if meta.type in UNSAFE_TYPES and not get_allow_pickle_read(allow_pickle_read):
34+
raise PinsInsecureReadError(
35+
f"Reading pin type {meta.type} involves reading a pickle file, so is NOT secure."
36+
f"Set the allow_pickle_read=True when creating the board, or the "
37+
f"{PINS_ENV_INSECURE_READ}=1 environment variable.\n"
38+
"See:\n"
39+
" * https://docs.python.org/3/library/pickle.html \n"
40+
" * https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations"
41+
)
2442

2543
# Check that only a single file name was given
2644
fnames = [meta.file] if isinstance(meta.file, str) else meta.file

pins/errors.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,3 +4,7 @@ class PinsError(Exception):
44

55
class PinsVersionError(PinsError):
66
pass
7+
8+
9+
class PinsInsecureReadError(PinsError):
10+
pass

0 commit comments

Comments
 (0)