Skip to content

Commit f2516ae

Browse files
authored
Merge pull request #27 from kush124k/adding-blackbox
Feat: Generic Parquet Recorder Node (dora-parquet-recorder)
2 parents 58d380b + 8d01e58 commit f2516ae

File tree

6 files changed

+288
-0
lines changed

6 files changed

+288
-0
lines changed
Lines changed: 108 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,108 @@
1+
# dora-dataset-record
2+
3+
Node for recording robot datasets in LeRobot format. It captures synchronized camera feeds and robot poses to create high-quality datasets for imitation learning and robot training.
4+
5+
- **Robot pose recording** - Capture both state and action data
6+
- **Multi-camera support** - Record from multiple cameras simultaneously
7+
- **LeRobot dataset format (v2.1)** - Direct integration with HuggingFace LeRobot datasets
8+
- **Episode management** - Automatic episode segmentation with reset phases
9+
10+
## Quick Start
11+
12+
### 1. Installation
13+
14+
```bash
15+
# Source your venv
16+
cd dora/node-hub/dora-dataset-record
17+
uv pip install -e .
18+
```
19+
20+
### 2. Usage Guide
21+
22+
Create a dataflow file, see `examples/lerobot-dataset-record/dataset_record.yml`:
23+
24+
```yaml
25+
nodes:
26+
# Dataset recorder
27+
- id: dataset_recorder
28+
build: pip install -e ../../dora-dataset-record
29+
path: dora-dataset-record
30+
inputs:
31+
laptop: laptop_cam/image
32+
front: front_cam/image
33+
robot_state: robot_follower/pose
34+
robot_action: leader_interface/pose
35+
outputs:
36+
- text
37+
env:
38+
# Required settings
39+
REPO_ID: "your_username/your_dataset_name"
40+
SINGLE_TASK: "Pick up the cube and place it in the box"
41+
ROBOT_TYPE: "your_robot_type"
42+
43+
# Recording settings
44+
FPS: "30"
45+
TOTAL_EPISODES: "50"
46+
EPISODE_DURATION_S: "60"
47+
RESET_DURATION_S: "15"
48+
49+
# Camera configuration
50+
CAMERA_NAMES: "laptop,front"
51+
CAMERA_LAPTOP_RESOLUTION: "480,640,3"
52+
CAMERA_FRONT_RESOLUTION: "480,640,3"
53+
54+
# Robot configuration
55+
ROBOT_JOINTS: "joint1,joint2,joint3,joint4,joint5,gripper"
56+
57+
# Optional settings
58+
USE_VIDEOS: "true"
59+
SAVE_AVIF_FRAMES: "true" # This will additionally save frames
60+
PUSH_TO_HUB: "false"
61+
PRIVATE: "false"
62+
TAGS: "robotics,manipulation,imitation_learning"
63+
64+
# Visualization with rerun
65+
- id: plot
66+
build: pip install dora-rerun
67+
path: dora-rerun
68+
inputs:
69+
text: dataset_recorder/text
70+
```
71+
72+
### 3. Start Recording the dataset
73+
74+
```bash
75+
dora build dataset_record.yml
76+
dora run dataset_record.yml
77+
```
78+
79+
The node sends status messages to dora-rerun about episode starts, reset periods, episode saving, and more.
80+
81+
## Configuration
82+
83+
### Required Environment Variables
84+
85+
| Variable | Description | Example |
86+
| --------------------- | ---------------------------- | -------------------------- |
87+
| `REPO_ID` | HuggingFace dataset repo | `"username/dataset_name"` |
88+
| `SINGLE_TASK` | Task description | `"Pick and place objects"` |
89+
| `CAMERA_NAMES` | Comma-separated camera names | `"laptop,front,top"` |
90+
| `CAMERA_*_RESOLUTION` | Resolution for each camera | `"480,640,3"` |
91+
| `ROBOT_JOINTS` | Comma-separated joint names | `"joint1,joint2,gripper"` |
92+
93+
### Optional Settings
94+
95+
| Variable | Default | Description |
96+
| -------------------- | ------------------------------------------- | ----------------------------------------------------- |
97+
| `FPS` | `30` | Recording frame rate (match camera fps) |
98+
| `TOTAL_EPISODES` | `10` | Number of episodes to record |
99+
| `EPISODE_DURATION_S` | `60` | Episode length in seconds |
100+
| `RESET_DURATION_S` | `15` | Break between episodes to reset the environment |
101+
| `USE_VIDEOS` | `true` | Encode as MP4 videos, else saves images |
102+
| `PUSH_TO_HUB` | `false` | Upload to HuggingFace Hub |
103+
| `PRIVATE` | `false` | Make dataset private |
104+
| `ROOT_PATH` | `~/.cache/huggingface/lerobot/your_repo_id` | Local storage path where you want to save the dataset |
105+
106+
## License
107+
108+
This project is released under the MIT License.

node-hub/dora-parquet-recorder/dora_parquet_recorder/__init__.py

Whitespace-only changes.
Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
"""Package entry point so the node can be run as ``python -m dora_parquet_recorder``."""

from .main import main

if __name__ == "__main__":
    main()
Lines changed: 152 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
1+
"""
2+
High-Performance Batched Parquet Recorder
3+
"""
4+
5+
import os
6+
import queue
7+
import threading
8+
import json
9+
import pyarrow as pa
10+
import pyarrow.parquet as pq
11+
from dora import Node
12+
from datetime import datetime
13+
from typing import Any
14+
15+
# CONFIGURATION — both values are overridable via environment variables.
BATCH_SIZE = int(os.getenv("BATCH_SIZE", "30"))  # tables buffered before each parquet write
LOG_DIR = os.getenv("LOG_DIR", "data_logs")  # output directory for the .parquet files
18+
19+
class DoraParquetRecorder:
    """Batched Parquet recorder.

    Incoming Arrow payloads are serialized on the caller's thread, pushed
    onto a queue, and written by a background thread — one Parquet file per
    input id. Rows are written in batches of ``BATCH_SIZE`` to amortize
    file I/O.
    """

    def __init__(self):
        # Thread-safe hand-off between handle_input() and the writer thread.
        self.write_queue = queue.Queue()
        # One ParquetWriter per input id, created lazily on first flush.
        self.writers = {}
        self.shutdown_flag = False

        os.makedirs(LOG_DIR, exist_ok=True)

        # Daemon thread so a crashed main loop cannot hang process exit.
        self.writer_thread = threading.Thread(target=self._writer_loop, daemon=True)
        self.writer_thread.start()
        print(f"[Recorder] Online. Batch Size: {BATCH_SIZE}", flush=True)

    def _writer_loop(self):
        """Collect small tables per input id and write them in big chunks.

        Runs until shutdown is requested AND the queue is drained, then
        flushes any partial batches and closes every writer.
        """
        # Buffer of pending tables per input id: {"cam_feed": [t1, t2, ...]}
        buffers = {}

        while not self.shutdown_flag or not self.write_queue.empty():
            try:
                # Short timeout so the shutdown flag is re-checked often.
                input_id, table = self.write_queue.get(timeout=0.1)
            except queue.Empty:
                continue

            try:
                buffers.setdefault(input_id, []).append(table)

                # Flush once this input's bucket reaches the batch size.
                if len(buffers[input_id]) >= BATCH_SIZE:
                    self._flush_buffer(input_id, buffers[input_id])
                    buffers[input_id] = []  # empty the bucket
            except Exception as e:
                print(f"[Recorder] Write error: {e}", flush=True)

        # FINAL CLEANUP: flush whatever is left in the buckets.
        print("[Recorder] Flushing remaining data...", flush=True)
        for input_id, buf in buffers.items():
            if buf:
                self._flush_buffer(input_id, buf)

        # Close files so Parquet footers are written.
        for writer in self.writers.values():
            writer.close()

    def _flush_buffer(self, input_id, table_list):
        """Merge buffered tables into one table and append it to the file."""
        try:
            if not table_list:
                return

            # Concatenation is cheap: Arrow tables share underlying buffers.
            batch_table = pa.concat_tables(table_list)

            # Lazily create the writer, pinned to the first batch's schema.
            if input_id not in self.writers:
                file_path = os.path.join(LOG_DIR, f"{input_id}.parquet")
                # compression='NONE' favors CPU speed; 'snappy' saves disk.
                self.writers[input_id] = pq.ParquetWriter(
                    file_path, batch_table.schema, compression='NONE'
                )
                print(f"[Recorder] Created log: {file_path}", flush=True)

            # One single write call for the whole batch.
            self.writers[input_id].write_table(batch_table)

        except Exception as e:
            print(f"[Recorder] Flush failed: {e}", flush=True)

    def handle_input(self, input_id: str, value: Any, metadata: Any):
        """Serialize one event and enqueue it for the writer thread.

        Args:
            input_id: Dora input name; each id gets its own parquet file.
            value: Arrow array (or plain Python value) carried by the event.
            metadata: Event metadata dict; stored as a JSON string column.
        """
        if self.shutdown_flag:
            return

        try:
            # default=str so non-JSON-native metadata values (timestamps,
            # numpy scalars, ...) are stringified instead of dropping the
            # whole event with a serialize error.
            meta_json = json.dumps(metadata, default=str)

            # Fast path: grab the raw value buffer with minimal copying.
            if hasattr(value, "buffers"):
                try:
                    data_blob = value.buffers()[1].to_pybytes()
                except Exception:
                    # buffers()[1] can be None (e.g. all-null arrays) or the
                    # layout may differ; fall back to a textual encoding.
                    data_blob = value.to_string().encode('utf-8')
            else:
                # Fallback for plain strings / other Python values.
                if not isinstance(value, (pa.Array, pa.ChunkedArray)):
                    value = pa.array([value])
                data_blob = value.to_pylist()[0]  # slower but safe

            timestamp = datetime.now().isoformat()

            table = pa.Table.from_pydict({
                "timestamp": [timestamp],
                "data": [data_blob],
                "metadata": [meta_json]
            })

            self.write_queue.put((input_id, table))

        except Exception as e:
            print(f"[Recorder] Serialize error: {e}", flush=True)

    def _shutdown(self):
        """Signal the writer thread to drain its queue and wait (max 5 s)."""
        self.shutdown_flag = True
        if self.writer_thread.is_alive():
            self.writer_thread.join(timeout=5.0)
131+
132+
def main():
    """Run the recorder node: handshake, then record every INPUT until STOP."""
    node = Node()
    recorder = DoraParquetRecorder()

    # --- HANDSHAKE ---
    # Tell downstream nodes the recorder is ready to receive data.
    print("[Recorder] Ready. Sending Signal...", flush=True)
    node.send_output("status", pa.array(["READY"]))
    # -----------------

    for event in node:
        event_type = event["type"]
        if event_type == "STOP":
            break
        if event_type == "INPUT":
            recorder.handle_input(
                event["id"], event["value"], event.get("metadata", {})
            )

    recorder._shutdown()

if __name__ == "__main__":
    main()
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
[build-system]
2+
requires = ["setuptools", "wheel"]
3+
build-backend = "setuptools.build_meta"
4+
5+
[project]
6+
name = "dora-parquet-recorder"
7+
version = "0.1.0"
8+
description = "A generic, zero-copy data recorder for dora."
9+
readme = "README.md"
10+
requires-python = ">=3.8"
11+
dependencies = [
12+
"dora-rs",
13+
"pyarrow",
14+
"pandas",
15+
"numpy",
16+
"opencv-python"
17+
]
18+
19+
[project.scripts]
20+
# This lets users run "dora-parquet-recorder" from command line
21+
dora-parquet-recorder = "dora_parquet_recorder.main:main"
Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
def test_import():
    """Smoke test: the recorder class must be importable from the package."""
    from dora_parquet_recorder.main import DoraParquetRecorder

    assert DoraParquetRecorder is not None

0 commit comments

Comments
 (0)