
Commit eb4d6e0

Add nbytes property (#227)
* add nbytes property
* dataset accessor method
* test
* release notes
* add to API docs
* fix implementation so it still displays non-virtual total in xarray repr
* mention in documentation
1 parent 3188ca0 commit eb4d6e0

7 files changed (+83 -1 lines changed)

docs/api.rst

Lines changed: 10 additions & 0 deletions
@@ -32,6 +32,16 @@ Serialization
    VirtualiZarrDatasetAccessor.to_zarr
    VirtualiZarrDatasetAccessor.to_icechunk
 
+Information
+-----------
+
+.. currentmodule:: virtualizarr.accessor
+.. autosummary::
+   :nosignatures:
+   :toctree: generated/
+
+   VirtualiZarrDatasetAccessor.nbytes
+
 Rewriting
 ---------
 
docs/releases.rst

Lines changed: 3 additions & 0 deletions
@@ -9,6 +9,9 @@ v1.2.1 (unreleased)
 New Features
 ~~~~~~~~~~~~
 
+- Added a ``.nbytes`` accessor method which displays the bytes needed to hold the virtual references in memory.
+  (:issue:`167`, :pull:`227`) By `Tom Nicholas <https://github.com/TomNicholas>`_.
+
 Breaking changes
 ~~~~~~~~~~~~~~~~
 
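The release note summarizes the feature; a minimal usage sketch (the file path is purely illustrative), comparing the new accessor property against xarray's usual total:

```python
from virtualizarr import open_virtual_dataset

vds = open_virtual_dataset("air.nc")  # hypothetical local netCDF file

# bytes needed to hold just the chunk references in memory
print(vds.virtualize.nbytes)

# xarray's normal total, which counts the full size of the referenced data
print(vds.nbytes)
```
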
docs/usage.md

Lines changed: 15 additions & 1 deletion
@@ -60,11 +60,25 @@ Attributes:
     title: 4x daily NMC reanalysis (1948)
 ```
 
-
 Generally a "virtual dataset" is any `xarray.Dataset` which wraps one or more {py:class}`ManifestArray <virtualizarr.manifests.ManifestArray>` objects.
 
 These particular {py:class}`ManifestArray <virtualizarr.manifests.ManifestArray>` objects are each a virtual reference to some data in the `air.nc` netCDF file, with the references stored in the form of "Chunk Manifests".
 
+As the manifest contains only addresses at which to find large binary chunks, the virtual dataset takes up far less space in memory than the original dataset does:
+
+```python
+ds.nbytes
+```
+```
+30975672
+```
+```python
+vds.virtualize.nbytes
+```
+```
+128
+```
+
 ```{important} Virtual datasets are not normal xarray datasets!
 
 Although the top-level type is still `xarray.Dataset`, they are intended only as an abstract representation of a set of data files, not as something you can do analysis with. If you try to load, view, or plot any data you will get a `NotImplementedError`. Virtual datasets only support a very limited subset of normal xarray operations, particularly functions and methods for concatenating, merging and extracting variables, as well as operations for renaming dimensions and variables.

virtualizarr/accessor.py

Lines changed: 18 additions & 0 deletions
@@ -183,3 +183,21 @@ def rename_paths(
                 new_ds[var_name].data = data.rename_paths(new=new)
 
         return new_ds
+
+    @property
+    def nbytes(self) -> int:
+        """
+        Size required to hold these references in memory in bytes.
+
+        Note this is not the size of the referenced chunks if they were actually loaded into memory,
+        this is only the size of the pointers to the chunk locations.
+        If you were to load the data into memory it would be ~1e6x larger for 1MB chunks.
+
+        In-memory (loadable) variables are included in the total using xarray's normal ``.nbytes`` method.
+        """
+        return sum(
+            var.data.nbytes_virtual
+            if isinstance(var.data, ManifestArray)
+            else var.nbytes
+            for var in self.ds.variables.values()
+        )
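The property mixes two kinds of sizes in one sum. A standalone sketch of that branching, using a hypothetical `FakeManifestArray` and plain namespaces instead of real `ManifestArray` objects and xarray variables:

```python
from types import SimpleNamespace

import numpy as np

class FakeManifestArray:
    """Hypothetical stand-in for ManifestArray; only its chunk references occupy memory."""
    nbytes_virtual = 32

def total_nbytes(variables) -> int:
    # Mirrors the accessor's generator expression: variables backed by the (fake)
    # ManifestArray contribute only the size of their references, while loaded
    # variables contribute their ordinary in-memory size.
    return sum(
        var.data.nbytes_virtual if isinstance(var.data, FakeManifestArray) else var.nbytes
        for var in variables
    )

loaded_data = np.arange(4, dtype="int32")  # 16 bytes actually in memory
virtual_var = SimpleNamespace(data=FakeManifestArray(), nbytes=15_476_000)  # what xarray would report for the full array
loaded_var = SimpleNamespace(data=loaded_data, nbytes=loaded_data.nbytes)

print(total_nbytes([virtual_var, loaded_var]))  # 32 + 16 == 48
```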

virtualizarr/manifests/array.py

Lines changed: 12 additions & 0 deletions
@@ -93,6 +93,18 @@ def size(self) -> int:
     def __repr__(self) -> str:
         return f"ManifestArray<shape={self.shape}, dtype={self.dtype}, chunks={self.chunks}>"
 
+    @property
+    def nbytes_virtual(self) -> int:
+        """
+        Size required to hold these references in memory in bytes.
+
+        Note this is not the size of the referenced array if it were actually loaded into memory,
+        this is only the size of the pointers to the chunk locations.
+        If you were to load the data into memory it would be ~1e6x larger for 1MB chunks.
+        """
+        # note: we don't name this method `.nbytes` as we don't want xarray's repr to use it
+        return self.manifest.nbytes
+
     def __array_function__(self, func, types, args, kwargs) -> Any:
         """
         Hook to teach this class what to do if np.concat etc. is called on it.
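The inline comment records the design choice referenced in the commit message: because xarray falls back to computing `size * dtype.itemsize` when a wrapped array does not define `.nbytes`, naming this property `nbytes_virtual` keeps the xarray repr showing the full non-virtual total. A rough sketch of that fallback, with a hypothetical `PointerOnlyArray` standing in for `ManifestArray`:

```python
import numpy as np

class PointerOnlyArray:
    """Hypothetical stand-in for ManifestArray: logical shape/dtype, but no loaded data."""
    shape = (2920, 25, 53)
    dtype = np.dtype("float32")

    @property
    def nbytes_virtual(self) -> int:
        return 128  # size of the chunk references only

def reported_nbytes(arr) -> int:
    # Approximation of xarray's fallback: use `.nbytes` when the wrapped array
    # defines it, otherwise compute the logical size from shape and dtype.
    if hasattr(arr, "nbytes"):
        return arr.nbytes
    return int(np.prod(arr.shape)) * arr.dtype.itemsize

arr = PointerOnlyArray()
print(reported_nbytes(arr))   # 15476000 -- the non-virtual total shown in the repr
print(arr.nbytes_virtual)     # 128      -- what .virtualize.nbytes counts instead
```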

virtualizarr/manifests/manifest.py

Lines changed: 11 additions & 0 deletions
@@ -357,6 +357,17 @@ def shape_chunk_grid(self) -> tuple[int, ...]:
     def __repr__(self) -> str:
         return f"ChunkManifest<shape={self.shape_chunk_grid}>"
 
+    @property
+    def nbytes(self) -> int:
+        """
+        Size required to hold these references in memory in bytes.
+
+        Note this is not the size of the referenced chunks if they were actually loaded into memory,
+        this is only the size of the pointers to the chunk locations.
+        If you were to load the data into memory it would be ~1e6x larger for 1MB chunks.
+        """
+        return self._paths.nbytes + self._offsets.nbytes + self._lengths.nbytes
+
     def __getitem__(self, key: ChunkKey) -> ChunkEntry:
         indices = split(key)
         path = self._paths[indices]
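The property simply sums the buffers of the three internal numpy arrays holding paths, offsets and lengths. A minimal illustration with hypothetical arrays (the dtypes `ChunkManifest` actually uses internally may differ):

```python
import numpy as np

# Hypothetical reference arrays for a 2x2 chunk grid; this only shows the arithmetic.
paths = np.array([["air.nc", "air.nc"], ["air.nc", "air.nc"]], dtype="<U32")
offsets = np.array([[0, 100], [200, 300]], dtype=np.uint64)
lengths = np.array([[100, 100], [100, 100]], dtype=np.uint64)

print(paths.nbytes, offsets.nbytes, lengths.nbytes)    # 512 32 32
print(paths.nbytes + offsets.nbytes + lengths.nbytes)  # 576 bytes, however large the referenced chunks are
```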

virtualizarr/tests/test_xarray.py

Lines changed: 14 additions & 0 deletions
@@ -3,6 +3,7 @@
 import numpy as np
 import pytest
 import xarray as xr
+from xarray import open_dataset
 
 from virtualizarr import open_virtual_dataset
 from virtualizarr.manifests import ChunkManifest, ManifestArray
@@ -310,3 +311,16 @@ def test_mixture_of_manifestarrays_and_numpy_arrays(
         == "s3://bucket/air.nc"
     )
     assert isinstance(renamed_vds["lat"].data, np.ndarray)
+
+
+@requires_kerchunk
+def test_nbytes(simple_netcdf4):
+    vds = open_virtual_dataset(simple_netcdf4)
+    assert vds.virtualize.nbytes == 32
+    assert vds.nbytes == 48
+
+    vds = open_virtual_dataset(simple_netcdf4, loadable_variables=["foo"])
+    assert vds.virtualize.nbytes == 48
+
+    ds = open_dataset(simple_netcdf4)
+    assert ds.virtualize.nbytes == ds.nbytes
