Skip to content

Commit 161256e

Browse files
Copilot and bnlawrence
committed
feat: add filesystem keyword argument to cfdm.read (refs cf-python#931)
Co-authored-by: bnlawrence <1792815+bnlawrence@users.noreply.github.com>
1 parent 9698c25 commit 161256e

File tree

4 files changed

+164
-41
lines changed

4 files changed

+164
-41
lines changed

cfdm/docstring/docstring.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -368,6 +368,21 @@
368368
*Example:*
369369
``('netCDF4', 'h5netcdf-pyfive', 'netcdf_file',
370370
'h5netcdf-h5py')``""",
371+
# read filesystem
372+
"{{read filesystem: optional}}": """filesystem: optional
373+
A pre-authenticated filesystem object (for example an
374+
``fsspec`` filesystem instance) to use for opening the
375+
dataset. When provided, *datasets* values are treated as
376+
paths understood by *filesystem*, and local string
377+
pre-processing (tilde/variable expansion, globbing and
378+
directory walking) is bypassed. The file is opened by
379+
calling ``filesystem.open(dataset, "rb")``, which returns
380+
a file-like object that is passed to the netCDF backend.
381+
382+
If `None` (the default) then the existing file-opening
383+
logic is used.
384+
385+
.. versionadded:: (cfdm) NEXTVERSION""",
371386
# read storage_options
372387
"{{read storage_options: `dict` or `None`, optional}}": """storage_options: `dict` or `None`, optional
373388
Pass parameters to the backend file system driver, such as

cfdm/read_write/netcdf/netcdfread.py

Lines changed: 68 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -544,35 +544,43 @@ def dataset_open(self, dataset, flatten=True, verbose=None):
544544

545545
g["cdl_filename"] = cdl_filename
546546

547-
u = urisplit(dataset)
548-
storage_options = self._get_storage_options(dataset, u)
549-
550-
if u.scheme == "s3":
547+
filesystem = g.get("filesystem")
548+
if filesystem is not None:
551549
# --------------------------------------------------------
552-
# A file in an S3 object store
550+
# A pre-authenticated filesystem was provided: open the
551+
# dataset as a file-like object and pass it to the backend.
553552
# --------------------------------------------------------
554-
from dask.base import tokenize
555-
556-
# Create an openable S3 file object
557-
fs_key = tokenize(("s3", storage_options))
558-
file_systems = g["file_systems"]
559-
file_system = file_systems.get(fs_key)
560-
if file_system is None:
561-
# An S3 file system with these options does not exist,
562-
# so create one.
563-
from s3fs import S3FileSystem
564-
565-
file_system = S3FileSystem(**storage_options)
566-
file_systems[fs_key] = file_system
567-
568-
# Reset 'dataset' to an s3fs.File object that can be
569-
# passed to the netCDF backend
570-
dataset = file_system.open(u.path[1:], "rb")
571-
572-
if is_log_level_detail(logger):
573-
logger.detail(
574-
f" S3: s3fs.S3FileSystem options: {storage_options}\n"
575-
) # pragma: no cover
553+
dataset = filesystem.open(dataset, "rb")
554+
else:
555+
u = urisplit(dataset)
556+
storage_options = self._get_storage_options(dataset, u)
557+
558+
if u.scheme == "s3":
559+
# --------------------------------------------------------
560+
# A file in an S3 object store
561+
# --------------------------------------------------------
562+
from dask.base import tokenize
563+
564+
# Create an openable S3 file object
565+
fs_key = tokenize(("s3", storage_options))
566+
file_systems = g["file_systems"]
567+
file_system = file_systems.get(fs_key)
568+
if file_system is None:
569+
# An S3 file system with these options does not exist,
570+
# so create one.
571+
from s3fs import S3FileSystem
572+
573+
file_system = S3FileSystem(**storage_options)
574+
file_systems[fs_key] = file_system
575+
576+
# Reset 'dataset' to an s3fs.File object that can be
577+
# passed to the netCDF backend
578+
dataset = file_system.open(u.path[1:], "rb")
579+
580+
if is_log_level_detail(logger):
581+
logger.detail(
582+
f" S3: s3fs.S3FileSystem options: {storage_options}\n"
583+
) # pragma: no cover
576584

577585
# Map backend names to dataset-open functions
578586
dataset_open_function = {
@@ -1015,6 +1023,7 @@ def read(
10151023
warn_valid=False,
10161024
domain=False,
10171025
storage_options=None,
1026+
filesystem=None,
10181027
_file_systems=None,
10191028
netcdf_backend=None,
10201029
cache=True,
@@ -1085,6 +1094,11 @@ def read(
10851094
10861095
.. versionadded:: (cfdm) 1.11.2.0
10871096
1097+
filesystem: optional
1098+
See `cfdm.read` for details.
1099+
1100+
.. versionadded:: (cfdm) NEXTVERSION
1101+
10881102
netcdf_backend: `None` or `str`, optional
10891103
See `cfdm.read` for details.
10901104
@@ -1229,22 +1243,33 @@ def read(
12291243
# Note that the `dataset_type` method is much faster than the
12301244
# `dataset_open` method at returning for unrecognised types.
12311245
# ------------------------------------------------------------
1232-
d_type = self.dataset_type(dataset, dataset_type)
1233-
if not d_type:
1234-
# Can't interpret the dataset as a recognised type, so
1235-
# either raise an exception or return an empty list.
1236-
if dataset_type is None:
1237-
raise DatasetTypeError(
1238-
f"Can't interpret {dataset} as a dataset of one of the "
1239-
f"valid types: {valid_dataset_types!r}"
1240-
)
1246+
if filesystem is not None:
1247+
# When a pre-authenticated filesystem is provided we cannot
1248+
# inspect the file locally, so we trust the caller. Use
1249+
# the explicitly requested dataset_type if given, otherwise
1250+
# default to 'netCDF'.
1251+
if dataset_type is not None and "netCDF" not in dataset_type:
1252+
# The caller explicitly excluded netCDF; nothing to do.
1253+
return []
12411254

1242-
return []
1255+
d_type = "netCDF"
1256+
else:
1257+
d_type = self.dataset_type(dataset, dataset_type)
1258+
if not d_type:
1259+
# Can't interpret the dataset as a recognised type, so
1260+
# either raise an exception or return an empty list.
1261+
if dataset_type is None:
1262+
raise DatasetTypeError(
1263+
f"Can't interpret {dataset} as a dataset of one of the "
1264+
f"valid types: {valid_dataset_types!r}"
1265+
)
12431266

1244-
# Can interpret the dataset as a recognised type, but return
1245-
# an empty list if that type has been excluded.
1246-
if dataset_type is not None and d_type not in dataset_type:
1247-
return []
1267+
return []
1268+
1269+
# Can interpret the dataset as a recognised type, but return
1270+
# an empty list if that type has been excluded.
1271+
if dataset_type is not None and d_type not in dataset_type:
1272+
return []
12481273

12491274
# ------------------------------------------------------------
12501275
# Parse the 'netcdf_backend' keyword parameter
@@ -1532,6 +1557,8 @@ def read(
15321557
"file_system_storage_options": {},
15331558
# Cached s3fs.S3FileSystem objects
15341559
"file_systems": _file_systems,
1560+
# Pre-authenticated filesystem object (e.g. fsspec)
1561+
"filesystem": filesystem,
15351562
# --------------------------------------------------------
15361563
# Array element caching
15371564
# --------------------------------------------------------

cfdm/read_write/read.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,10 @@ class read(ReadWrite):
154154
155155
.. versionadded:: (cfdm) 1.11.2.0
156156
157+
{{read filesystem: optional}}
158+
159+
.. versionadded:: (cfdm) NEXTVERSION
160+
157161
{{read storage_options: `dict` or `None`, optional}}
158162
159163
.. versionadded:: (cfdm) 1.11.2.0
@@ -243,6 +247,7 @@ def __new__(
243247
domain=False,
244248
netcdf_backend=None,
245249
storage_options=None,
250+
filesystem=None,
246251
cache=True,
247252
dask_chunks="storage-aligned",
248253
store_dataset_chunks=True,
@@ -344,6 +349,14 @@ def _datasets(self):
344349
followlinks = kwargs.get("followlinks", False)
345350

346351
datasets = self._flat(kwargs["datasets"])
352+
353+
# If a filesystem object is provided, treat each dataset path
354+
# as-is (no local glob/walk/expansion) and yield directly.
355+
if kwargs.get("filesystem") is not None:
356+
for dataset1 in datasets:
357+
yield dataset1
358+
return
359+
347360
if kwargs["cdl_string"]:
348361
# Return CDL strings as they are
349362
for dataset1 in datasets:
@@ -580,6 +593,7 @@ def _read(self, dataset):
580593
"unpack",
581594
"domain",
582595
"storage_options",
596+
"filesystem",
583597
"netcdf_backend",
584598
"cache",
585599
"dask_chunks",

cfdm/test/test_read_write.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1541,6 +1541,73 @@ def test_read_zarr_and_non_zarr(self):
15411541
self.assertEqual(len(f), 5)
15421542

15431543

1544+
def test_read_filesystem(self):
1545+
"""Test cfdm.read with a pre-authenticated filesystem object."""
1546+
import io
1547+
from unittest.mock import MagicMock
1548+
1549+
f = self.f0
1550+
cfdm.write(f, tmpfile)
1551+
1552+
# ------------------------------------------------------------------
1553+
# Build a mock filesystem whose .open() returns the real file bytes
1554+
# so that the netCDF backend can parse them normally.
1555+
# ------------------------------------------------------------------
1556+
with open(tmpfile, "rb") as fh:
1557+
file_bytes = fh.read()
1558+
1559+
open_calls = []
1560+
1561+
def fake_open(path, mode="rb"):
1562+
open_calls.append((path, mode))
1563+
return io.BytesIO(file_bytes)
1564+
1565+
mock_fs = MagicMock()
1566+
mock_fs.open.side_effect = fake_open
1567+
1568+
# Read using the mock filesystem
1569+
result = cfdm.read(tmpfile, filesystem=mock_fs)
1570+
1571+
# filesystem.open() must have been called with the dataset path
1572+
self.assertTrue(len(open_calls) > 0, "filesystem.open was not called")
1573+
self.assertEqual(open_calls[0][0], tmpfile)
1574+
self.assertEqual(open_calls[0][1], "rb")
1575+
1576+
# The read result must match what we get without filesystem
1577+
expected = cfdm.read(tmpfile)
1578+
self.assertEqual(len(result), len(expected))
1579+
self.assertTrue(result[0].equals(expected[0]))
1580+
1581+
def test_read_filesystem_bypasses_glob(self):
1582+
"""Test that filesystem=... bypasses local glob expansion."""
1583+
import io
1584+
from unittest.mock import MagicMock
1585+
1586+
f = self.f0
1587+
cfdm.write(f, tmpfile)
1588+
1589+
with open(tmpfile, "rb") as fh:
1590+
file_bytes = fh.read()
1591+
1592+
yielded_datasets = []
1593+
1594+
def fake_open(path, mode="rb"):
1595+
yielded_datasets.append(path)
1596+
return io.BytesIO(file_bytes)
1597+
1598+
mock_fs = MagicMock()
1599+
mock_fs.open.side_effect = fake_open
1600+
1601+
# Pass a glob-like pattern as the dataset. Without filesystem,
1602+
# this would expand to matching local files. With filesystem, it
1603+
# must be passed through unchanged.
1604+
pattern = "/some/remote/path/*.nc"
1605+
cfdm.read(pattern, filesystem=mock_fs)
1606+
1607+
# The pattern must have been forwarded verbatim to filesystem.open()
1608+
self.assertEqual(yielded_datasets, [pattern])
1609+
1610+
15441611
if __name__ == "__main__":
15451612
print("Run date:", datetime.datetime.now())
15461613
cfdm.environment()

0 commit comments

Comments
 (0)