Skip to content

Commit 85e8cc3

Browse files
Implement force_identical_write (#260)
* Implement `force_identical_write` * Reword the comments * Add docs for `force_identical_write` to `get_started.qmd` * Clarify wording. * Add sleep call to help docs build * Use force_identical_write in rsc test * remove duplicate tests --------- Co-authored-by: isabelizimm <[email protected]>
1 parent 240768b commit 85e8cc3

File tree

3 files changed

+152
-10
lines changed

3 files changed

+152
-10
lines changed

docs/get_started.qmd

Lines changed: 20 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,8 @@ jupyter: python3
44
---
55

66
```{python}
7-
#| include: false
7+
# | include: false
8+
import time
89
import pandas as pd
910
pd.options.display.max_rows = 25
1011
```
@@ -126,7 +127,7 @@ While we’ll do our best to keep the automatically generated metadata consisten
126127

127128
## Versioning
128129

129-
Every [](`~pins.boards.BaseBoard.pin_write`) will create a new version:
130+
By default, calls to [](`~pins.boards.BaseBoard.pin_write`) will usually create a new version:
130131

131132
```{python}
132133
board2 = board_temp()
@@ -136,6 +137,23 @@ board2.pin_write([1,2], name = "x", type = "json")
136137
board2.pin_versions("x")
137138
```
138139

140+
The only exception is when the data is identical to the most recent version (compared via file hash); in that case no new version is written:
141+
142+
```{python}
143+
board2.pin_write([1], name = "x", type = "json")
144+
time.sleep(1.1) # later, let's try and write a new version of the same data...
145+
board2.pin_write([1], name = "x", type = "json")
146+
board2.pin_versions("x")
147+
```
148+
149+
150+
However, you can opt out of this behaviour with `force_identical_write=True`:
151+
```{python}
152+
time.sleep(1.1) # try again...
153+
board2.pin_write([1], name = "x", type = "json", force_identical_write=True)
154+
board2.pin_versions("x")
155+
```
156+
139157
By default, [](`~pins.boards.BaseBoard.pin_read`) will return the most recent version:
140158

141159
```{python}

pins/boards.py

Lines changed: 45 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -225,6 +225,8 @@ def _pin_store(
225225
metadata: Mapping | None = None,
226226
versioned: bool | None = None,
227227
created: datetime | None = None,
228+
*,
229+
force_identical_write: bool = False,
228230
) -> Meta:
229231
if type == "feather":
230232
warn_deprecated(
@@ -248,8 +250,16 @@ def _pin_store(
248250

249251
pin_name = self.path_to_pin(name)
250252

253+
# Pre-emptively fetch the most recent pin's meta if it exists - this is used
254+
# for the force_identical_write check
255+
abort_if_identical = not force_identical_write and self.pin_exists(name)
256+
if abort_if_identical:
257+
last_meta = self.pin_meta(name)
258+
251259
with tempfile.TemporaryDirectory() as tmp_dir:
252-
# create all pin data (e.g. data.txt, save object)
260+
# create all pin data (e.g. data.txt, save object) to get the metadata.
261+
# For unversioned boards, this will also delete the most recent pin version,
262+
# ready for it to be replaced with a new one.
253263
meta = self.prepare_pin_version(
254264
tmp_dir,
255265
x,
@@ -263,6 +273,18 @@ def _pin_store(
263273
object_name=object_name,
264274
)
265275

276+
# force_identical_write check
277+
if abort_if_identical:
278+
last_hash = last_meta.pin_hash
279+
280+
if last_hash == meta.pin_hash:
281+
msg = (
282+
f'The hash of pin "{name}" has not changed. Your pin will not '
283+
f"be stored."
284+
)
285+
inform(log=_log, msg=msg)
286+
return last_meta
287+
266288
# move pin to destination ----
267289
# create pin version folder
268290
dst_pin_path = self.construct_path([pin_name])
@@ -310,6 +332,8 @@ def pin_write(
310332
metadata: Mapping | None = None,
311333
versioned: bool | None = None,
312334
created: datetime | None = None,
335+
*,
336+
force_identical_write: bool = False,
313337
) -> Meta:
314338
"""Write a pin object to the board.
315339
@@ -336,6 +360,17 @@ def pin_write(
336360
created:
337361
A date to store in the Meta.created field. This field may be used as
338362
part of the pin version name.
363+
force_identical_write:
364+
Store the pin even if the pin contents are identical to the last version
365+
(compared using the hash). Only the pin contents are compared, not the pin
366+
metadata. Defaults to False.
367+
368+
Returns
369+
-------
370+
Meta:
371+
Metadata about the stored pin. If `force_identical_write` is False and the
372+
pin contents are identical to the last version, the last version's metadata
373+
is returned.
339374
"""
340375

341376
if type == "file":
@@ -345,7 +380,15 @@ def pin_write(
345380
)
346381

347382
return self._pin_store(
348-
x, name, type, title, description, metadata, versioned, created
383+
x,
384+
name,
385+
type,
386+
title,
387+
description,
388+
metadata,
389+
versioned,
390+
created,
391+
force_identical_write=force_identical_write,
349392
)
350393

351394
def pin_download(self, name, version=None, hash=None) -> Sequence[str]:

pins/tests/test_boards.py

Lines changed: 87 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -136,6 +136,40 @@ def test_board_pin_write_file_raises_error(board, tmp_path):
136136
board.pin_write(path, "cool_pin", type="file")
137137

138138

139+
@pytest.mark.parametrize("force_identical_write", [True, False])
140+
def test_board_pin_write_force_identical_write_pincount(board, force_identical_write):
141+
df = pd.DataFrame({"x": [1, 2, 3]})
142+
143+
# 1min ago to avoid name collision
144+
one_min_ago = datetime.now() - timedelta(minutes=1)
145+
board.pin_write(df, "cool_pin", type="csv", created=one_min_ago)
146+
board.pin_write(
147+
df, "cool_pin", type="csv", force_identical_write=force_identical_write
148+
)
149+
versions = board.pin_versions("cool_pin")
150+
if force_identical_write:
151+
assert len(versions) == 2
152+
else:
153+
assert len(versions) == 1
154+
155+
156+
def test_board_pin_write_force_identical_write_msg(
157+
board, capfd: pytest.CaptureFixture[str]
158+
):
159+
df = pd.DataFrame({"x": [1, 2, 3]})
160+
161+
# 1min ago to avoid name collision
162+
one_min_ago = datetime.now() - timedelta(minutes=1)
163+
board.pin_write(df, "cool_pin", type="csv", created=one_min_ago)
164+
board.pin_write(df, "cool_pin", type="csv")
165+
versions = board.pin_versions("cool_pin")
166+
167+
_, err = capfd.readouterr()
168+
msg = 'The hash of pin "cool_pin" has not changed. Your pin will not be stored.'
169+
assert msg in err
170+
assert len(versions) == 1
171+
172+
139173
def test_board_pin_download(board_with_cache, tmp_path):
140174
# create and save data
141175
df = pd.DataFrame({"x": [1, 2, 3]})
@@ -309,6 +343,32 @@ def test_board_pin_read_insecure_succeed_board_flag(board):
309343
# pin_write with unversioned boards ===========================================
310344

311345

346+
@pytest.mark.parametrize("versioned", [None, False])
347+
def test_board_unversioned_pin_write_unversioned_force_identical_write(
348+
versioned, board_unversioned
349+
):
350+
# 1min ago to avoid name collision
351+
one_min_ago = datetime.now() - timedelta(minutes=1)
352+
board_unversioned.pin_write(
353+
{"a": 1},
354+
"test_pin",
355+
type="json",
356+
versioned=versioned,
357+
created=one_min_ago,
358+
force_identical_write=True,
359+
)
360+
board_unversioned.pin_write(
361+
{"a": 2},
362+
"test_pin",
363+
type="json",
364+
versioned=versioned,
365+
force_identical_write=True,
366+
)
367+
368+
assert len(board_unversioned.pin_versions("test_pin")) == 1
369+
assert board_unversioned.pin_read("test_pin") == {"a": 2}
370+
371+
312372
@pytest.mark.parametrize("versioned", [None, False])
313373
def test_board_unversioned_pin_write_unversioned(versioned, board_unversioned):
314374
board_unversioned.pin_write({"a": 1}, "test_pin", type="json", versioned=versioned)
@@ -346,9 +406,14 @@ def pin_name():
346406

347407
@pytest.fixture
348408
def pin_del(board, df, pin_name):
349-
meta_old = board.pin_write(df, pin_name, type="csv", title="some title")
350-
sleep(1)
351-
meta_new = board.pin_write(df, pin_name, type="csv", title="some title")
409+
# 1min ago to avoid name collision
410+
one_min_ago = datetime.now() - timedelta(minutes=1)
411+
meta_old = board.pin_write(
412+
df, pin_name, type="csv", title="some title", created=one_min_ago
413+
)
414+
meta_new = board.pin_write(
415+
df, pin_name, type="csv", title="some title", force_identical_write=True
416+
)
352417

353418
assert len(board.pin_versions(pin_name)) == 2
354419
assert meta_old.version.version != meta_new.version.version
@@ -363,8 +428,22 @@ def pin_prune(board, df, pin_name):
363428
two_days_ago = today - timedelta(days=2, minutes=1)
364429

365430
board.pin_write(df, pin_name, type="csv", title="some title", created=today)
366-
board.pin_write(df, pin_name, type="csv", title="some title", created=day_ago)
367-
board.pin_write(df, pin_name, type="csv", title="some title", created=two_days_ago)
431+
board.pin_write(
432+
df,
433+
pin_name,
434+
type="csv",
435+
title="some title",
436+
created=day_ago,
437+
force_identical_write=True,
438+
)
439+
board.pin_write(
440+
df,
441+
pin_name,
442+
type="csv",
443+
title="some title",
444+
created=two_days_ago,
445+
force_identical_write=True,
446+
)
368447

369448
versions = board.pin_versions(pin_name, as_df=False)
370449
assert len(versions) == 3
@@ -573,7 +652,9 @@ def test_board_pin_search_admin_user(df, board_short, fs_admin): # noqa
573652
@pytest.mark.fs_rsc
574653
def test_board_rsc_pin_write_title_update(df, board_short):
575654
board_short.pin_write(df, "susan/some_df", type="csv", title="title a")
576-
board_short.pin_write(df, "susan/some_df", type="csv", title="title b")
655+
board_short.pin_write(
656+
df, "susan/some_df", type="csv", title="title b", force_identical_write=True
657+
)
577658

578659
content = board_short.fs.info("susan/some_df")
579660
assert content["title"] == "title b"

0 commit comments

Comments
 (0)