huggingface · WeiwenLiu1221 · Oct 14, 2025 · Oct 14, 2025 · Oct 16, 2025 · Oct 17, 2025
diff --git a/src/lerobot/__init__.py b/src/lerobot/__init__.py
@@ -183,6 +183,7 @@
 available_cameras = [
     "opencv",
     "intelrealsense",
+    "zed",
 ]
 
 # lists all available motors from `lerobot/motors`

diff --git a/src/lerobot/cameras/__init__.py b/src/lerobot/cameras/__init__.py
@@ -15,3 +15,22 @@
 from .camera import Camera
 from .configs import CameraConfig, ColorMode, Cv2Rotation
 from .utils import make_cameras_from_configs
+
+from .camera import Camera
+from .configs import CameraConfig, ColorMode, Cv2Rotation
+from .utils import make_cameras_from_configs
+
+from .opencv.configuration_opencv import OpenCVCameraConfig
+from .realsense.configuration_realsense import RealSenseCameraConfig
+from .zed.configuration_zed import ZedCameraConfig
+
+__all__ = [
+    "Camera",
+    "CameraConfig",
+    "ColorMode",
+    "Cv2Rotation",
+    "make_cameras_from_configs",
+    "OpenCVCameraConfig",
+    "RealSenseCameraConfig",
+    "ZedCameraConfig",
+]
diff --git a/src/lerobot/cameras/camera.py b/src/lerobot/cameras/camera.py
@@ -89,28 +89,34 @@ def connect(self, warmup: bool = True) -> None:
         pass
 
     @abc.abstractmethod
-    def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
+    def read(self, color_mode: ColorMode | None = None) -> dict[str, np.ndarray]:
         """Capture and return a single frame from the camera.
 
         Args:
-            color_mode: Desired color mode for the output frame. If None,
-                        uses the camera's default color mode.
+            color_mode: Desired color mode for the output frame. If None, uses the camera's default color mode.
 
         Returns:
-            np.ndarray: Captured frame as a numpy array.
+            dict[str, np.ndarray]: Dictionary with modality keys and image data.
+                The keys are automatically determined based on array shape:
+                - 'image': For 2D arrays (H, W) - grayscale images
+                - 'image': For 3D arrays (H, W, 1) - grayscale images
+                - 'image': For 3D arrays (H, W, 3) - RGB/BGR images
+                - 'image': For 3D arrays (H, W, 4) - RGBA images
+                - 'depth': For depth maps (H, W)
+                Additional modality-specific keys may be provided by specific cameras.
         """
         pass
 
     @abc.abstractmethod
-    def async_read(self, timeout_ms: float = ...) -> np.ndarray:
+    def async_read(self, timeout_ms: float = ...) -> dict[str, np.ndarray]:
         """Asynchronously capture and return a single frame from the camera.
 
         Args:
-            timeout_ms: Maximum time to wait for a frame in milliseconds.
-                        Defaults to implementation-specific timeout.
+            timeout_ms: Maximum time to wait for a frame in milliseconds. Defaults to implementation-specific timeout.
 
         Returns:
-            np.ndarray: Captured frame as a numpy array.
+            dict[str, np.ndarray]: Dictionary mapping modality keys to image data.
+                See the read() method for detailed documentation on modality keys and image formats.
         """
         pass
 

diff --git a/src/lerobot/cameras/opencv/camera_opencv.py b/src/lerobot/cameras/opencv/camera_opencv.py
@@ -34,7 +34,7 @@
 from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError
 
 from ..camera import Camera
-from ..utils import get_cv2_backend, get_cv2_rotation
+from ..utils import get_cv2_backend, get_cv2_rotation, get_image_modality_key
 from .configuration_opencv import ColorMode, OpenCVCameraConfig
 
 # NOTE(Steven): The maximum opencv device index depends on your operating system. For instance,
@@ -289,7 +289,7 @@ def find_cameras() -> list[dict[str, Any]]:
 
         return found_cameras_info
 
-    def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
+    def read(self, color_mode: ColorMode | None = None) -> dict[str, np.ndarray]:
         """
         Reads a single frame synchronously from the camera.
 
@@ -301,11 +301,6 @@ def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
                 color mode (`self.color_mode`) for this read operation (e.g.,
                 request RGB even if default is BGR).
 
-        Returns:
-            np.ndarray: The captured frame as a NumPy array in the format
-                       (height, width, channels), using the specified or default
-                       color mode and applying any configured rotation.
-
         Raises:
             DeviceNotConnectedError: If the camera is not connected.
             RuntimeError: If reading the frame from the camera fails or if the
@@ -327,7 +322,8 @@ def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
         read_duration_ms = (time.perf_counter() - start_time) * 1e3
         logger.debug(f"{self} read took: {read_duration_ms:.1f}ms")
 
-        return processed_frame
+        image_modality_key = get_image_modality_key(image=processed_frame)
+        return {image_modality_key: processed_frame}
 
     def _postprocess_image(self, image: np.ndarray, color_mode: ColorMode | None = None) -> np.ndarray:
         """
@@ -419,7 +415,7 @@ def _stop_read_thread(self) -> None:
         self.thread = None
         self.stop_event = None
 
-    def async_read(self, timeout_ms: float = 200) -> np.ndarray:
+    def async_read(self, timeout_ms: float = 200) -> dict[str, np.ndarray]:
         """
         Reads the latest available frame asynchronously.
 
@@ -432,7 +428,7 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
                 to become available. Defaults to 200ms (0.2 seconds).
 
         Returns:
-            np.ndarray: The latest captured frame as a NumPy array in the format
+            dict[str, np.ndarray]: A map of the latest captured frames as a NumPy array in the format
                        (height, width, channels), processed according to configuration.
 
         Raises:

diff --git a/src/lerobot/cameras/reachy2_camera/reachy2_camera.py b/src/lerobot/cameras/reachy2_camera/reachy2_camera.py
@@ -23,6 +23,8 @@
 from threading import Event, Lock, Thread
 from typing import Any
 
+from ..utils import get_image_modality_key
+
 # Fix MSMF hardware transform compatibility for Windows before importing cv2
 if platform.system() == "Windows" and "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS" not in os.environ:
     os.environ["OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"] = "0"
@@ -131,7 +133,7 @@ def find_cameras(ip_address: str = "localhost", port: int = 50065) -> list[dict[
         camera_manager.disconnect()
         return initialized_cameras
 
-    def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
+    def read(self, color_mode: ColorMode | None = None) -> dict[str, np.ndarray]:
         """
         Reads a single frame synchronously from the camera.
 
@@ -143,7 +145,7 @@ def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
                 request RGB even if default is BGR).
 
         Returns:
-            np.ndarray: The captured frame as a NumPy array in the format
+            dict[str, np.ndarray]: The captured frame as a NumPy array in the format
                        (height, width, channels), using the specified or default
                        color mode and applying any configured rotation.
         """
@@ -177,7 +179,8 @@ def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
         read_duration_ms = (time.perf_counter() - start_time) * 1e3
         logger.debug(f"{self} read took: {read_duration_ms:.1f}ms")
 
-        return frame
+        image_modality_key = get_image_modality_key(image=frame)
+        return {image_modality_key: frame}
 
     def _read_loop(self):
         """
@@ -226,7 +229,7 @@ def _stop_read_thread(self) -> None:
         self.thread = None
         self.stop_event = None
 
-    def async_read(self, timeout_ms: float = 200) -> np.ndarray:
+    def async_read(self, timeout_ms: float = 200) -> dict[str, np.ndarray]:
         """
         Reads the latest available frame asynchronously.
 
@@ -239,7 +242,7 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
                 to become available. Defaults to 200ms (0.2 seconds).
 
         Returns:
-            np.ndarray: The latest captured frame as a NumPy array in the format
+            dict[str, np.ndarray]: A map of the latest captured frames as a NumPy array in the format
                        (height, width, channels), processed according to configuration.
 
         Raises:
@@ -267,7 +270,8 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
         if frame is None:
             raise RuntimeError(f"Internal error: Event set but no frame available for {self}.")
 
-        return frame
+        image_modality_key = get_image_modality_key(image=frame)
+        return {image_modality_key: frame}
 
     def disconnect(self):
         """

diff --git a/src/lerobot/cameras/realsense/camera_realsense.py b/src/lerobot/cameras/realsense/camera_realsense.py
@@ -33,7 +33,7 @@
 
 from ..camera import Camera
 from ..configs import ColorMode
-from ..utils import get_cv2_rotation
+from ..utils import get_cv2_rotation, get_image_modality_key
 from .configuration_realsense import RealSenseCameraConfig
 
 logger = logging.getLogger(__name__)
@@ -351,7 +351,7 @@ def read_depth(self, timeout_ms: int = 200) -> np.ndarray:
 
         return depth_map_processed
 
-    def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np.ndarray:
+    def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> dict[str, np.ndarray]:
         """
         Reads a single frame (color) synchronously from the camera.
 
@@ -362,7 +362,7 @@ def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np
             timeout_ms (int): Maximum time in milliseconds to wait for a frame. Defaults to 200ms.
 
         Returns:
-            np.ndarray: The captured color frame as a NumPy array
+            dict[str, np.ndarray]: A map of the captured color frame as a NumPy array
               (height, width, channels), processed according to `color_mode` and rotation.
 
         Raises:
@@ -389,7 +389,8 @@ def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np
         read_duration_ms = (time.perf_counter() - start_time) * 1e3
         logger.debug(f"{self} read took: {read_duration_ms:.1f}ms")
 
-        return color_image_processed
+        image_modality_key = get_image_modality_key(image=frame)
+        return {image_modality_key: frame}
 
     def _postprocess_image(
         self, image: np.ndarray, color_mode: ColorMode | None = None, depth_frame: bool = False
@@ -486,7 +487,7 @@ def _stop_read_thread(self):
         self.stop_event = None
 
     # NOTE(Steven): Missing implementation for depth for now
-    def async_read(self, timeout_ms: float = 200) -> np.ndarray:
+    def async_read(self, timeout_ms: float = 200) -> dict[str, np.ndarray]:
         """
         Reads the latest available frame data (color) asynchronously.
 
@@ -499,8 +500,8 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
                 to become available. Defaults to 200ms (0.2 seconds).
 
         Returns:
-            np.ndarray:
-            The latest captured frame data (color image), processed according to configuration.
+            dict[str, np.ndarray]:
+            A map of the latest captured frame data (color image), processed according to configuration.
 
         Raises:
             DeviceNotConnectedError: If the camera is not connected.
@@ -527,7 +528,8 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
         if frame is None:
             raise RuntimeError(f"Internal error: Event set but no frame available for {self}.")
 
-        return frame
+        image_modality_key = get_image_modality_key(image=frame)
+        return {image_modality_key: frame}
 
     def disconnect(self):
         """

diff --git a/src/lerobot/cameras/utils.py b/src/lerobot/cameras/utils.py
@@ -17,6 +17,8 @@
 import platform
 from typing import cast
 
+import numpy as np
+
 from lerobot.utils.import_utils import make_device_from_device_class
 
 from .camera import Camera
@@ -43,6 +45,11 @@ def make_cameras_from_configs(camera_configs: dict[str, CameraConfig]) -> dict[s
 
             cameras[key] = Reachy2Camera(cfg)
 
+        elif cfg.type == "zed":
+            from .zed import ZedCamera
+
+            cameras[key] = ZedCamera(cfg)
+
         else:
             try:
                 cameras[key] = cast(Camera, make_device_from_device_class(cfg))
@@ -74,3 +81,47 @@ def get_cv2_backend() -> int:
     #     return cv2.CAP_AVFOUNDATION
     else:  # Linux and others
         return cv2.CAP_ANY
+
+
+def get_image_modality_key(image: np.ndarray, is_depth: bool = False) -> str:
+    """
+    Determine the modality key based on image array shape.
+
+    Args:
+        image: Image array from the camera. Should be a numpy array.
+        is_depth: If True, explicitly treat as depth modality regardless of shape.
+
+    Returns:
+        str: Modality key indicating the image type:
+            - 'depth': Depth maps
+            - 'gray': Grayscale images (H, W) or (H, W, 1)
+            - 'rgb': RGB images (H, W, 3)
+            - 'rgba': RGBA images (H, W, 4)
+            - 'unknown': For unsupported array shapes
+
+    Raises:
+        ValueError: If image is not a numpy array
+
+    Example:
+        >>> get_image_modality_key(np.zeros((480, 640)))  # 'gray'
+        >>> get_image_modality_key(np.zeros((480, 640, 3)))  # 'rgb'
+        >>> get_image_modality_key(np.zeros((480, 640)), is_depth=True)  # 'depth'
+    """
+    if not isinstance(image, np.ndarray):
+        raise ValueError(f"Expected numpy array, got {type(image)}")
+
+    if is_depth:
+        return "depth"
+
+    if len(image.shape) == 2:
+        return "gray"
+    elif len(image.shape) == 3:
+        channels = image.shape[2]
+        if channels == 1:
+            return "gray"
+        elif channels == 3:
+            return "rgb"
+        elif channels == 4:
+            return "rgba"
+
+    return "unknown"
diff --git a/src/lerobot/cameras/zed/__init__.py b/src/lerobot/cameras/zed/__init__.py
@@ -0,0 +1,2 @@
+from .camera_zed import ZedCamera
+from .configuration_zed import ZedCameraConfig
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		from .camera_zed import ZedCamera
		from .configuration_zed import ZedCameraConfig