Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions src/lerobot/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,7 @@
available_cameras = [
"opencv",
"intelrealsense",
"zed",
]

# lists all available motors from `lerobot/motors`
Expand Down
19 changes: 19 additions & 0 deletions src/lerobot/cameras/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,22 @@
from .camera import Camera
from .configs import CameraConfig, ColorMode, Cv2Rotation
from .utils import make_cameras_from_configs

from .camera import Camera
from .configs import CameraConfig, ColorMode, Cv2Rotation
from .utils import make_cameras_from_configs

from .opencv.configuration_opencv import OpenCVCameraConfig
from .realsense.configuration_realsense import RealSenseCameraConfig
from .zed.configuration_zed import ZedCameraConfig

__all__ = [
"Camera",
"CameraConfig",
"ColorMode",
"Cv2Rotation",
"make_cameras_from_configs",
"OpenCVCameraConfig",
"RealSenseCameraConfig",
"ZedCameraConfig",
]
22 changes: 14 additions & 8 deletions src/lerobot/cameras/camera.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,28 +89,34 @@ def connect(self, warmup: bool = True) -> None:
pass

@abc.abstractmethod
def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
def read(self, color_mode: ColorMode | None = None) -> dict[str, np.ndarray]:
"""Capture and return a single frame from the camera.

Args:
color_mode: Desired color mode for the output frame. If None,
uses the camera's default color mode.
color_mode: Desired color mode for the output frame. If None, uses the camera's default color mode.

Returns:
np.ndarray: Captured frame as a numpy array.
dict[str, np.ndarray]: Dictionary with modality keys and image data.
The keys are automatically determined based on array shape:
- 'image': For 2D arrays (H, W) - grayscale images
- 'image': For 3D arrays (H, W, 1) - grayscale images
- 'image': For 3D arrays (H, W, 3) - RGB/BGR images
- 'image': For 3D arrays (H, W, 4) - RGBA images
- 'depth': For depth maps (H, W)
Additional modality-specific keys may be provided by specific cameras.
"""
pass

@abc.abstractmethod
def async_read(self, timeout_ms: float = ...) -> np.ndarray:
def async_read(self, timeout_ms: float = ...) -> dict[str, np.ndarray]:
"""Asynchronously capture and return a single frame from the camera.

Args:
timeout_ms: Maximum time to wait for a frame in milliseconds.
Defaults to implementation-specific timeout.
timeout_ms: Maximum time to wait for a frame in milliseconds. Defaults to implementation-specific timeout.

Returns:
np.ndarray: Captured frame as a numpy array.
dict[str, np.ndarray]: Dictionary mapping modality keys to image data.
See the read() method for detailed documentation on modality keys and image formats.
"""
pass

Expand Down
16 changes: 6 additions & 10 deletions src/lerobot/cameras/opencv/camera_opencv.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
from lerobot.utils.errors import DeviceAlreadyConnectedError, DeviceNotConnectedError

from ..camera import Camera
from ..utils import get_cv2_backend, get_cv2_rotation
from ..utils import get_cv2_backend, get_cv2_rotation, get_image_modality_key
from .configuration_opencv import ColorMode, OpenCVCameraConfig

# NOTE(Steven): The maximum opencv device index depends on your operating system. For instance,
Expand Down Expand Up @@ -289,7 +289,7 @@ def find_cameras() -> list[dict[str, Any]]:

return found_cameras_info

def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
def read(self, color_mode: ColorMode | None = None) -> dict[str, np.ndarray]:
"""
Reads a single frame synchronously from the camera.

Expand All @@ -301,11 +301,6 @@ def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
color mode (`self.color_mode`) for this read operation (e.g.,
request RGB even if default is BGR).

Returns:
np.ndarray: The captured frame as a NumPy array in the format
(height, width, channels), using the specified or default
color mode and applying any configured rotation.

Raises:
DeviceNotConnectedError: If the camera is not connected.
RuntimeError: If reading the frame from the camera fails or if the
Expand All @@ -327,7 +322,8 @@ def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
read_duration_ms = (time.perf_counter() - start_time) * 1e3
logger.debug(f"{self} read took: {read_duration_ms:.1f}ms")

return processed_frame
image_modality_key = get_image_modality_key(image=processed_frame)
return {image_modality_key: processed_frame}

def _postprocess_image(self, image: np.ndarray, color_mode: ColorMode | None = None) -> np.ndarray:
"""
Expand Down Expand Up @@ -419,7 +415,7 @@ def _stop_read_thread(self) -> None:
self.thread = None
self.stop_event = None

def async_read(self, timeout_ms: float = 200) -> np.ndarray:
def async_read(self, timeout_ms: float = 200) -> dict[str, np.ndarray]:
"""
Reads the latest available frame asynchronously.

Expand All @@ -432,7 +428,7 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
to become available. Defaults to 200ms (0.2 seconds).

Returns:
np.ndarray: The latest captured frame as a NumPy array in the format
dict[str, np.ndarray]: A map of the latest captured frames as a NumPy array in the format
(height, width, channels), processed according to configuration.

Raises:
Expand Down
16 changes: 10 additions & 6 deletions src/lerobot/cameras/reachy2_camera/reachy2_camera.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@
from threading import Event, Lock, Thread
from typing import Any

from ..utils import get_image_modality_key

# Fix MSMF hardware transform compatibility for Windows before importing cv2
if platform.system() == "Windows" and "OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS" not in os.environ:
os.environ["OPENCV_VIDEOIO_MSMF_ENABLE_HW_TRANSFORMS"] = "0"
Expand Down Expand Up @@ -131,7 +133,7 @@ def find_cameras(ip_address: str = "localhost", port: int = 50065) -> list[dict[
camera_manager.disconnect()
return initialized_cameras

def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
def read(self, color_mode: ColorMode | None = None) -> dict[str, np.ndarray]:
"""
Reads a single frame synchronously from the camera.

Expand All @@ -143,7 +145,7 @@ def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
request RGB even if default is BGR).

Returns:
np.ndarray: The captured frame as a NumPy array in the format
dict[str, np.ndarray]: The captured frame as a NumPy array in the format
(height, width, channels), using the specified or default
color mode and applying any configured rotation.
"""
Expand Down Expand Up @@ -177,7 +179,8 @@ def read(self, color_mode: ColorMode | None = None) -> np.ndarray:
read_duration_ms = (time.perf_counter() - start_time) * 1e3
logger.debug(f"{self} read took: {read_duration_ms:.1f}ms")

return frame
image_modality_key = get_image_modality_key(image=frame)
return {image_modality_key: frame}

def _read_loop(self):
"""
Expand Down Expand Up @@ -226,7 +229,7 @@ def _stop_read_thread(self) -> None:
self.thread = None
self.stop_event = None

def async_read(self, timeout_ms: float = 200) -> np.ndarray:
def async_read(self, timeout_ms: float = 200) -> dict[str, np.ndarray]:
"""
Reads the latest available frame asynchronously.

Expand All @@ -239,7 +242,7 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
to become available. Defaults to 200ms (0.2 seconds).

Returns:
np.ndarray: The latest captured frame as a NumPy array in the format
dict[str, np.ndarray]: A map of the latest captured frames as a NumPy array in the format
(height, width, channels), processed according to configuration.

Raises:
Expand Down Expand Up @@ -267,7 +270,8 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
if frame is None:
raise RuntimeError(f"Internal error: Event set but no frame available for {self}.")

return frame
image_modality_key = get_image_modality_key(image=frame)
return {image_modality_key: frame}

def disconnect(self):
"""
Expand Down
18 changes: 10 additions & 8 deletions src/lerobot/cameras/realsense/camera_realsense.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@

from ..camera import Camera
from ..configs import ColorMode
from ..utils import get_cv2_rotation
from ..utils import get_cv2_rotation, get_image_modality_key
from .configuration_realsense import RealSenseCameraConfig

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -351,7 +351,7 @@ def read_depth(self, timeout_ms: int = 200) -> np.ndarray:

return depth_map_processed

def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np.ndarray:
def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> dict[str, np.ndarray]:
"""
Reads a single frame (color) synchronously from the camera.

Expand All @@ -362,7 +362,7 @@ def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np
timeout_ms (int): Maximum time in milliseconds to wait for a frame. Defaults to 200ms.

Returns:
np.ndarray: The captured color frame as a NumPy array
dict[str, np.ndarray]: A map of the captured color frame as a NumPy array
(height, width, channels), processed according to `color_mode` and rotation.

Raises:
Expand All @@ -389,7 +389,8 @@ def read(self, color_mode: ColorMode | None = None, timeout_ms: int = 200) -> np
read_duration_ms = (time.perf_counter() - start_time) * 1e3
logger.debug(f"{self} read took: {read_duration_ms:.1f}ms")

return color_image_processed
image_modality_key = get_image_modality_key(image=frame)
return {image_modality_key: frame}

def _postprocess_image(
self, image: np.ndarray, color_mode: ColorMode | None = None, depth_frame: bool = False
Expand Down Expand Up @@ -486,7 +487,7 @@ def _stop_read_thread(self):
self.stop_event = None

# NOTE(Steven): Missing implementation for depth for now
def async_read(self, timeout_ms: float = 200) -> np.ndarray:
def async_read(self, timeout_ms: float = 200) -> dict[str, np.ndarray]:
"""
Reads the latest available frame data (color) asynchronously.

Expand All @@ -499,8 +500,8 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
to become available. Defaults to 200ms (0.2 seconds).

Returns:
np.ndarray:
The latest captured frame data (color image), processed according to configuration.
dict[str, np.ndarray]:
A map of the latest captured frame data (color image), processed according to configuration.

Raises:
DeviceNotConnectedError: If the camera is not connected.
Expand All @@ -527,7 +528,8 @@ def async_read(self, timeout_ms: float = 200) -> np.ndarray:
if frame is None:
raise RuntimeError(f"Internal error: Event set but no frame available for {self}.")

return frame
image_modality_key = get_image_modality_key(image=frame)
return {image_modality_key: frame}

def disconnect(self):
"""
Expand Down
51 changes: 51 additions & 0 deletions src/lerobot/cameras/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@
import platform
from typing import cast

import numpy as np

from lerobot.utils.import_utils import make_device_from_device_class

from .camera import Camera
Expand All @@ -43,6 +45,11 @@ def make_cameras_from_configs(camera_configs: dict[str, CameraConfig]) -> dict[s

cameras[key] = Reachy2Camera(cfg)

elif cfg.type == "zed":
from .zed import ZedCamera

cameras[key] = ZedCamera(cfg)

else:
try:
cameras[key] = cast(Camera, make_device_from_device_class(cfg))
Expand Down Expand Up @@ -74,3 +81,47 @@ def get_cv2_backend() -> int:
# return cv2.CAP_AVFOUNDATION
else: # Linux and others
return cv2.CAP_ANY


def get_image_modality_key(image: np.ndarray, is_depth: bool = False) -> str:
"""
Determine the modality key based on image array shape.

Args:
image: Image array from the camera. Should be a numpy array.
is_depth: If True, explicitly treat as depth modality regardless of shape.

Returns:
str: Modality key indicating the image type:
- 'depth': Depth maps
- 'gray': Grayscale images (H, W) or (H, W, 1)
- 'rgb': RGB images (H, W, 3)
- 'rgba': RGBA images (H, W, 4)
- 'unknown': For unsupported array shapes

Raises:
ValueError: If image is not a numpy array

Example:
>>> get_image_modality_key(np.zeros((480, 640))) # 'gray'
>>> get_image_modality_key(np.zeros((480, 640, 3))) # 'rgb'
>>> get_image_modality_key(np.zeros((480, 640)), is_depth=True) # 'depth'
"""
if not isinstance(image, np.ndarray):
raise ValueError(f"Expected numpy array, got {type(image)}")

if is_depth:
return "depth"

if len(image.shape) == 2:
return "gray"
elif len(image.shape) == 3:
channels = image.shape[2]
if channels == 1:
return "gray"
elif channels == 3:
return "rgb"
elif channels == 4:
return "rgba"

return "unknown"
2 changes: 2 additions & 0 deletions src/lerobot/cameras/zed/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from .camera_zed import ZedCamera
from .configuration_zed import ZedCameraConfig
Loading