
Commit 831e687

Authored by d-v-b, jakirkham, and joshmoore
Optimize setitem with chunk equal to fill_value, round 2 (#738)
* Consolidate encode/store in _chunk_setitem_nosync. Matches how these lines are written in `_set_basic_selection_zd`.
* Clear key-value pair if chunk is just fill value. Add a simple check to see if the key-value pair is being set with a chunk equal to the fill value. If so, simply delete the key-value pair instead of storing a chunk that only contains the fill value. The Array will behave the same externally, but this cuts down on the space required to store the Array, and ensures that copying one Array to another won't dramatically affect the storage size.
* set empty chunk write behavior via array constructor
* add rudimentary tests, np.equal -> np.array_equal
* add test for chunk deletion
* add flattening function
* add kwarg for empty writes to array creators
* fix check for chunk equality to fill value
* flake8
* add None check to setitems
* add write_empty_chunks to output of __getstate__
* flake8
* add partial decompress to __getstate__
* functionalize emptiness checks and key deletion
* flake8
* add path for delitems, and add some failing tests
* flake8
* add delitems method to FSStore, and correspondingly change zarr.Array behavior
* add nested + empty writes test
* set write_empty_chunks to True by default
* rename chunk_is_empty method and clean up logic in _chunk_setitem
* rename test
* add test for flatten
* initial support for using delitems api in chunk_setitems
* flake8
* strip path separator that was breaking store.listdir
* change tests to check for empty writing behavior
* bump fsspec and s3fs versions
* delitems: only attempt to delete keys that exist
* don't pass empty collections to self.map.delitems
* flake8
* use main branch of fsspec until a new release is cut
* add support for checking if a chunk is all NaNs in _chunk_is_empty
* docstring tweaks
* clean up empty_write tests
* fix hexdigests for FSStore + empty writes, and remove redundant nchunks_initialized test
* resolve merge conflicts in favor of master
* set write_empty_chunks to True by default; put chunk emptiness checking in a function in util.py; optimize chunk emptiness checking
* remove np.typing import
* use public attribute in test_nchunks_initialized
* remove duplicated logic in _chunk_setitems, instead using _chunk_delitems; clean up 0d empty writes; add coverage exemptions
* expand 0d tests and nchunks_initialized tests to hit more parts of the write_empty_chunks logic
* remove return type annotation for all_equal that was breaking CI
* refactor write_empty_chunks tests by expanding the create_array logic in the base test class, remove standalone write_empty_chunks tests
* correctly handle merge from upstream master
* don't use os.path.join for constructing a chunk key; instead use _chunk_key method
* complete removal of os.path.join calls
* add coverage exemption to type error branch in all_equal
* remove unreachable conditionals in n5 tests
* instantiate ReadOnlyError
* add explicit delitems and setitems calls to readonly fsstore tests
* Update docstrings
* Update requirements_dev_optional
* Add changelog

Co-authored-by: John Kirkham <[email protected]>
Co-authored-by: Josh Moore <[email protected]>
Co-authored-by: jmoore <[email protected]>
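The central behavior described in this commit message — deleting a chunk's store key instead of writing a chunk that only contains the fill value — can be sketched against a plain dict acting as the store. `store_chunk` and this simplified `all_equal` are hypothetical stand-ins for zarr's `_chunk_setitem_nosync` and `zarr.util.all_equal`; real chunks are NumPy arrays and are encoded before storage.

```python
def all_equal(value, data):
    # stand-in for zarr.util.all_equal: True when every element equals value
    return all(item == value for item in data)


def store_chunk(store, key, chunk, fill_value, write_empty_chunks=True):
    """Store a chunk, or delete its key when the chunk is all fill_value."""
    if not write_empty_chunks and all_equal(fill_value, chunk):
        # delete rather than store; tolerate a missing key
        store.pop(key, None)
    else:
        store[key] = chunk


store = {}
store_chunk(store, "0.0", [1, 2, 3], fill_value=0, write_empty_chunks=False)
store_chunk(store, "0.1", [0, 0, 0], fill_value=0, write_empty_chunks=False)
# only the chunk holding non-fill-value data was stored
print(sorted(store))  # ['0.0']
```

Externally the array reads back identically, because missing chunks are materialized from the fill value on read.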
1 parent 7c31f04 commit 831e687

File tree

9 files changed: +314 −54 lines changed


docs/release.rst

Lines changed: 3 additions & 0 deletions
@@ -12,6 +12,9 @@ Enhancements
 * array indexing with [] (getitem and setitem) now supports fancy indexing.
   By :user:`Juan Nunez-Iglesias <jni>`; :issue:`725`.

+* write_empty_chunks=False deletes chunks consisting of only fill_value.
+  By :user:`Davis Bennett <d-v-b>`; :issue:`738`.
+
 .. _release_2.10.2:

 2.10.2

zarr/core.py

Lines changed: 72 additions & 13 deletions
@@ -32,6 +32,7 @@
 from zarr.meta import decode_array_metadata, encode_array_metadata
 from zarr.storage import array_meta_key, attrs_key, getsize, listdir
 from zarr.util import (
+    all_equal,
     InfoReporter,
     check_array_shape,
     human_readable_size,
@@ -75,6 +76,14 @@ class Array:
         If True and the chunk_store is an FSStore and the compression used
         is Blosc, when getting data from the array chunks will be partially
         read and decompressed when possible.
+    write_empty_chunks : bool, optional
+        If True (default), all chunks will be stored regardless of their
+        contents. If False, each chunk is compared to the array's fill
+        value prior to storing. If a chunk is uniformly equal to the fill
+        value, then that chunk is not stored, and the store entry for
+        that chunk's key is deleted. This setting enables sparser storage,
+        as only chunks with non-fill-value data are stored, at the expense
+        of overhead associated with checking the data of each chunk.

         .. versionadded:: 2.7
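The commit log also mentions "checking if a chunk is all nans". A plain equality test cannot detect that case, because `NaN != NaN`. The sketch below shows the idea with a pure-Python `all_equal`; the real helper lives in `zarr.util` and operates on NumPy arrays, so this is an illustrative assumption, not zarr's actual implementation.

```python
import math


def all_equal(value, data):
    """True when every element of data equals value, treating NaN specially."""
    if isinstance(value, float) and math.isnan(value):
        # NaN != NaN, so an all-NaN fill value needs an explicit isnan check
        return all(isinstance(x, float) and math.isnan(x) for x in data)
    return all(x == value for x in data)


nan = float("nan")
print(all_equal(nan, [nan, nan]))  # True
print(all_equal(0, [0, 0, 1]))     # False
```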
@@ -107,6 +116,7 @@ class Array:
     info
     vindex
     oindex
+    write_empty_chunks

     Methods
     -------
@@ -139,6 +149,7 @@ def __init__(
         cache_metadata=True,
         cache_attrs=True,
         partial_decompress=False,
+        write_empty_chunks=True,
     ):
         # N.B., expect at this point store is fully initialized with all
         # configuration metadata fully specified and normalized
@@ -155,6 +166,7 @@
         self._cache_metadata = cache_metadata
         self._is_view = False
         self._partial_decompress = partial_decompress
+        self._write_empty_chunks = write_empty_chunks

         # initialize metadata
         self._load_metadata()
@@ -455,6 +467,13 @@ def vindex(self):
         :func:`set_mask_selection` for documentation and examples."""
         return self._vindex

+    @property
+    def write_empty_chunks(self) -> bool:
+        """A Boolean, True if chunks composed of the array's fill value
+        will be stored. If False, such chunks will not be stored.
+        """
+        return self._write_empty_chunks
+
     def __eq__(self, other):
         return (
             isinstance(other, Array) and
@@ -1626,9 +1645,18 @@ def _set_basic_selection_zd(self, selection, value, fields=None):
         else:
             chunk[selection] = value

-        # encode and store
-        cdata = self._encode_chunk(chunk)
-        self.chunk_store[ckey] = cdata
+        # remove chunk if write_empty_chunks is false and it only contains the fill value
+        if (not self.write_empty_chunks) and all_equal(self.fill_value, chunk):
+            try:
+                del self.chunk_store[ckey]
+                return
+            except Exception:  # pragma: no cover
+                # deleting failed, fallback to overwriting
+                pass
+        else:
+            # encode and store
+            cdata = self._encode_chunk(chunk)
+            self.chunk_store[ckey] = cdata

     def _set_basic_selection_nd(self, selection, value, fields=None):
         # implementation of __setitem__ for array with at least one dimension
@@ -1896,11 +1924,38 @@ def _chunk_getitems(self, lchunk_coords, lchunk_selection, out, lout_selection,
             out[out_select] = fill_value

     def _chunk_setitems(self, lchunk_coords, lchunk_selection, values, fields=None):
-        ckeys = [self._chunk_key(co) for co in lchunk_coords]
-        cdatas = [self._process_for_setitem(key, sel, val, fields=fields)
-                  for key, sel, val in zip(ckeys, lchunk_selection, values)]
-        values = {k: v for k, v in zip(ckeys, cdatas)}
-        self.chunk_store.setitems(values)
+        ckeys = map(self._chunk_key, lchunk_coords)
+        cdatas = {key: self._process_for_setitem(key, sel, val, fields=fields)
+                  for key, sel, val in zip(ckeys, lchunk_selection, values)}
+        to_store = {}
+        if not self.write_empty_chunks:
+            empty_chunks = {k: v for k, v in cdatas.items() if all_equal(self.fill_value, v)}
+            self._chunk_delitems(empty_chunks.keys())
+            nonempty_keys = cdatas.keys() - empty_chunks.keys()
+            to_store = {k: self._encode_chunk(cdatas[k]) for k in nonempty_keys}
+        else:
+            to_store = {k: self._encode_chunk(v) for k, v in cdatas.items()}
+        self.chunk_store.setitems(to_store)
+
+    def _chunk_delitems(self, ckeys):
+        if hasattr(self.store, "delitems"):
+            self.store.delitems(ckeys)
+        else:  # pragma: no cover
+            # exempting this branch from coverage as there are no extant stores
+            # that will trigger this condition, but it's possible that they
+            # will be developed in the future.
+            tuple(map(self._chunk_delitem, ckeys))
+        return None
+
+    def _chunk_delitem(self, ckey):
+        """
+        Attempt to delete the value associated with ckey.
+        """
+        try:
+            del self.chunk_store[ckey]
+            return
+        except KeyError:
+            return

     def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None):
         """Replace part or whole of a chunk.
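The batched path in `_chunk_setitems` above partitions processed chunks into empty and non-empty sets, bulk-deletes the former, and bulk-stores the latter. A minimal sketch of that partitioning over a plain dict store (`set_chunks` is a hypothetical simplification; the real method also encodes chunks and delegates deletion to the store):

```python
def set_chunks(store, cdatas, fill_value, write_empty_chunks):
    """Mimic _chunk_setitems: split processed chunks into empty and non-empty."""
    def all_equal(value, data):
        return all(x == value for x in data)

    if not write_empty_chunks:
        # chunks uniformly equal to the fill value get their keys deleted
        empty_keys = {k for k, v in cdatas.items() if all_equal(fill_value, v)}
        for k in empty_keys:
            store.pop(k, None)
        to_store = {k: cdatas[k] for k in cdatas.keys() - empty_keys}
    else:
        to_store = dict(cdatas)
    store.update(to_store)  # stands in for chunk_store.setitems


store = {"0.1": [5, 5]}
set_chunks(store, {"0.0": [1, 2], "0.1": [0, 0]}, fill_value=0, write_empty_chunks=False)
print(sorted(store))  # ['0.0'] — the now-empty chunk '0.1' was deleted
```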
@@ -1931,8 +1986,12 @@ def _chunk_setitem(self, chunk_coords, chunk_selection, value, fields=None):
     def _chunk_setitem_nosync(self, chunk_coords, chunk_selection, value, fields=None):
         ckey = self._chunk_key(chunk_coords)
         cdata = self._process_for_setitem(ckey, chunk_selection, value, fields=fields)
-        # store
-        self.chunk_store[ckey] = cdata
+
+        # attempt to delete chunk if it only contains the fill value
+        if (not self.write_empty_chunks) and all_equal(self.fill_value, cdata):
+            self._chunk_delitem(ckey)
+        else:
+            self.chunk_store[ckey] = self._encode_chunk(cdata)

     def _process_for_setitem(self, ckey, chunk_selection, value, fields=None):
         if is_total_slice(chunk_selection, self._chunks) and not fields:
@@ -1988,8 +2047,7 @@ def _process_for_setitem(self, ckey, chunk_selection, value, fields=None):
         else:
             chunk[chunk_selection] = value

-        # encode chunk
-        return self._encode_chunk(chunk)
+        return chunk

     def _chunk_key(self, chunk_coords):
         return self._key_prefix + self._dimension_separator.join(map(str, chunk_coords))
@@ -2209,7 +2267,8 @@ def hexdigest(self, hashname="sha1"):

     def __getstate__(self):
         return (self._store, self._path, self._read_only, self._chunk_store,
-                self._synchronizer, self._cache_metadata, self._attrs.cache)
+                self._synchronizer, self._cache_metadata, self._attrs.cache,
+                self._partial_decompress, self._write_empty_chunks)

     def __setstate__(self, state):
         self.__init__(*state)
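Adding `_partial_decompress` and `_write_empty_chunks` to `__getstate__` matters because `__setstate__` rebuilds the array by calling `__init__(*state)`: any constructor argument missing from the state tuple silently reverts to its default after unpickling. A minimal sketch of the pattern with a toy class (`ToyArray` is an assumption for illustration, not zarr's class):

```python
import pickle


class ToyArray:
    """Minimal sketch of the __getstate__/__setstate__ pattern used by zarr.Array."""

    def __init__(self, store, partial_decompress=False, write_empty_chunks=True):
        self._store = store
        self._partial_decompress = partial_decompress
        self._write_empty_chunks = write_empty_chunks

    def __getstate__(self):
        # every constructor argument must appear here, or it is silently
        # reset to its default after unpickling
        return (self._store, self._partial_decompress, self._write_empty_chunks)

    def __setstate__(self, state):
        self.__init__(*state)


z = ToyArray({}, write_empty_chunks=False)
z2 = pickle.loads(pickle.dumps(z))
print(z2._write_empty_chunks)  # False — the flag survives a pickle round-trip
```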

zarr/creation.py

Lines changed: 22 additions & 5 deletions
@@ -21,7 +21,7 @@ def create(shape, chunks=True, dtype=None, compressor='default',
            fill_value=0, order='C', store=None, synchronizer=None,
            overwrite=False, path=None, chunk_store=None, filters=None,
            cache_metadata=True, cache_attrs=True, read_only=False,
-           object_codec=None, dimension_separator=None, **kwargs):
+           object_codec=None, dimension_separator=None, write_empty_chunks=True, **kwargs):
     """Create an array.

     Parameters
@@ -71,6 +71,15 @@ def create(shape, chunks=True, dtype=None, compressor='default',
     dimension_separator : {'.', '/'}, optional
         Separator placed between the dimensions of a chunk.
         .. versionadded:: 2.8
+    write_empty_chunks : bool, optional
+        If True (default), all chunks will be stored regardless of their
+        contents. If False, each chunk is compared to the array's fill
+        value prior to storing. If a chunk is uniformly equal to the fill
+        value, then that chunk is not stored, and the store entry for
+        that chunk's key is deleted. This setting enables sparser storage,
+        as only chunks with non-fill-value data are stored, at the expense
+        of overhead associated with checking the data of each chunk.
+

     Returns
     -------
@@ -142,7 +151,8 @@ def create(shape, chunks=True, dtype=None, compressor='default',

     # instantiate array
     z = Array(store, path=path, chunk_store=chunk_store, synchronizer=synchronizer,
-              cache_metadata=cache_metadata, cache_attrs=cache_attrs, read_only=read_only)
+              cache_metadata=cache_metadata, cache_attrs=cache_attrs, read_only=read_only,
+              write_empty_chunks=write_empty_chunks)

     return z

@@ -400,6 +410,7 @@ def open_array(
     chunk_store=None,
     storage_options=None,
     partial_decompress=False,
+    write_empty_chunks=True,
     **kwargs
 ):
     """Open an array using file-mode-like semantics.
@@ -454,8 +465,14 @@ def open_array(
         If True and the chunk_store is an FSStore and the compression used
         is Blosc, when getting data from the array chunks will be partially
         read and decompressed when possible.
-
-        .. versionadded:: 2.7
+    write_empty_chunks : bool, optional
+        If True (default), all chunks will be stored regardless of their
+        contents. If False, each chunk is compared to the array's fill
+        value prior to storing. If a chunk is uniformly equal to the fill
+        value, then that chunk is not stored, and the store entry for
+        that chunk's key is deleted. This setting enables sparser storage,
+        as only chunks with non-fill-value data are stored, at the expense
+        of overhead associated with checking the data of each chunk.

     Returns
     -------
@@ -545,7 +562,7 @@ def open_array(

     # instantiate array
     z = Array(store, read_only=read_only, synchronizer=synchronizer,
               cache_metadata=cache_metadata, cache_attrs=cache_attrs, path=path,
-              chunk_store=chunk_store)
+              chunk_store=chunk_store, write_empty_chunks=write_empty_chunks)

     return z
zarr/storage.py

Lines changed: 9 additions & 0 deletions
@@ -1154,6 +1154,15 @@ def __delitem__(self, key):
         else:
             del self.map[key]

+    def delitems(self, keys):
+        if self.mode == 'r':
+            raise ReadOnlyError()
+        # only remove the keys that exist in the store
+        nkeys = [self._normalize_key(key) for key in keys if key in self]
+        # rm errors if you pass an empty collection
+        if len(nkeys) > 0:
+            self.map.delitems(nkeys)
+
     def __contains__(self, key):
         key = self._normalize_key(key)
         return key in self.map
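The two guards in `FSStore.delitems` — filtering to keys that exist and skipping the call entirely for an empty collection — exist because the underlying fsspec mapping errors on both cases, per the commit log ("delitems: only attempt to delete keys that exist", "don't pass empty collections to self.map.delitems"). A sketch with toy classes (`ToyFSMap` and `ToyStore` are assumptions that mimic that erroring behavior):

```python
class ToyFSMap(dict):
    """Hypothetical stand-in for the fsspec mapping behind FSStore."""

    def delitems(self, keys):
        if not keys:  # mimic fsspec erroring on an empty collection
            raise ValueError("no keys given")
        for k in keys:
            del self[k]  # raises KeyError on a missing key


class ToyStore:
    def __init__(self, mapping):
        self.map = ToyFSMap(mapping)

    def delitems(self, keys):
        # only remove the keys that exist, and never pass an
        # empty collection to the underlying map
        nkeys = [k for k in keys if k in self.map]
        if len(nkeys) > 0:
            self.map.delitems(nkeys)


store = ToyStore({"a": 1, "b": 2})
store.delitems(["a", "missing"])  # missing key is ignored, no error
store.delitems([])                # empty collection short-circuits
print(sorted(store.map))  # ['b']
```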
