From c95292e0336e6c35e86eb9110e4abab455117aac Mon Sep 17 00:00:00 2001 From: Rodrigo Barbosa Date: Wed, 12 Nov 2025 16:07:21 +0100 Subject: [PATCH] fix on dynamic config and data only examples --- examples/webrtc_sdk/data_only_example.py | 182 ++++++++++ examples/webrtc_sdk/dynamic_config_example.py | 316 ++++++++++++++++++ inference_sdk/webrtc/__init__.py | 3 +- inference_sdk/webrtc/config.py | 36 ++ inference_sdk/webrtc/session.py | 135 +++++++- 5 files changed, 670 insertions(+), 2 deletions(-) create mode 100644 examples/webrtc_sdk/data_only_example.py create mode 100644 examples/webrtc_sdk/dynamic_config_example.py diff --git a/examples/webrtc_sdk/data_only_example.py b/examples/webrtc_sdk/data_only_example.py new file mode 100644 index 0000000000..b273081883 --- /dev/null +++ b/examples/webrtc_sdk/data_only_example.py @@ -0,0 +1,182 @@ +""" +WebRTC SDK example demonstrating DATA_ONLY output mode. + +This example shows how to use the DATA_ONLY mode to receive only inference +results without video feedback, which significantly reduces bandwidth usage. + +DATA_ONLY mode is ideal for: +- Analytics and metrics collection +- Headless inference servers +- High-throughput object counting +- Logging detections for later analysis +- IoT devices with limited bandwidth + +Usage: + python examples/webrtc_sdk/data_only_example.py \ + --workspace-name WORKSPACE_NAME \ + --workflow-id WORKFLOW_ID \ + [--api-url http://localhost:9001] \ + [--api-key API_KEY] \ + [--duration 30] + +Press Ctrl+C to stop early. 
+""" +import argparse +import time +from collections import defaultdict +from datetime import datetime + +from inference_sdk import InferenceHTTPClient +from inference_sdk.webrtc import OutputMode, StreamConfig, VideoMetadata, WebcamSource + + +def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser("WebRTC SDK Data-Only Mode Example") + p.add_argument("--api-url", default="http://localhost:9001") + p.add_argument("--workspace-name", required=True) + p.add_argument("--workflow-id", required=True) + p.add_argument("--image-input-name", default="image") + p.add_argument("--api-key", default=None) + p.add_argument( + "--duration", + type=int, + default=30, + help="How long to run in seconds (default: 30)", + ) + p.add_argument( + "--data-fields", + type=str, + default=None, + help="Comma-separated list of fields to receive (default: all outputs)", + ) + return p.parse_args() + + +def main() -> None: + args = parse_args() + client = InferenceHTTPClient.init(api_url=args.api_url, api_key=args.api_key) + + # Prepare source + source = WebcamSource() + + # Configure data output fields + if args.data_fields: + data_output = [f.strip() for f in args.data_fields.split(",")] + else: + data_output = [] # Empty list means all outputs + + # Configure for DATA_ONLY mode - no video will be sent back + config = StreamConfig( + output_mode=OutputMode.DATA_ONLY, # Only data, no video + data_output=data_output, # What fields to receive + realtime_processing=True, # Process frames in realtime + ) + + # Statistics tracking + stats = { + "frames_processed": 0, + "start_time": time.time(), + "detections_per_frame": [], + "field_counts": defaultdict(int), + } + + print("\n" + "=" * 70) + print("WebRTC SDK - DATA_ONLY Mode Example") + print("=" * 70) + print(f"\nConfiguration:") + print(f" Output Mode: DATA_ONLY (no video feedback)") + print(f" Data Fields: {data_output if data_output else 'ALL workflow outputs'}") + print(f" Duration: {args.duration} seconds") + print(f" API 
URL: {args.api_url}") + print(f"\nStarting session... (Press Ctrl+C to stop early)") + print("-" * 70 + "\n") + + # Start streaming session + with client.webrtc.stream( + source=source, + workflow=args.workflow_id, + workspace=args.workspace_name, + image_input=args.image_input_name, + config=config, + ) as session: + + # Global data handler - receives all workflow outputs + @session.on_data() + def handle_all_data(data: dict, metadata: VideoMetadata): + stats["frames_processed"] += 1 + frame_num = stats["frames_processed"] + + # Track which fields we received + if data: + for field_name in data.keys(): + stats["field_counts"][field_name] += 1 + + # Print periodic updates with property_definition value + if frame_num % 10 == 0: + elapsed = time.time() - stats["start_time"] + fps = frame_num / elapsed if elapsed > 0 else 0 + + # Extract property_definition value if present + property_value = data.get("property_definition", "N/A") if data else "N/A" + + print( + f"Frame {frame_num:4d} | " + f"FPS: {fps:5.1f} | " + f"property_definition: {property_value} | " + f"Fields: {list(data.keys()) if data else 'none'}" + ) + + # Field-specific handler for predictions (if available) + @session.on_data("predictions") + def handle_predictions(predictions: dict, metadata: VideoMetadata): + # Count detections + if isinstance(predictions, dict) and "predictions" in predictions: + num_detections = len(predictions["predictions"]) + stats["detections_per_frame"].append(num_detections) + + # Log significant events + if num_detections > 5: + print(f" → High activity: {num_detections} detections!") + + # Run for specified duration + start_time = time.time() + try: + while time.time() - start_time < args.duration: + time.sleep(0.1) # Small sleep to prevent busy loop + except KeyboardInterrupt: + print("\n\nStopped by user.") + + # Print final statistics + elapsed = time.time() - stats["start_time"] + print("\n" + "=" * 70) + print("Session Statistics") + print("=" * 70) + 
print(f"\nDuration: {elapsed:.1f} seconds") + print(f"Frames Processed: {stats['frames_processed']}") + print(f"Average FPS: {stats['frames_processed'] / elapsed:.1f}") + + if stats["field_counts"]: + print(f"\nFields Received:") + for field, count in sorted(stats["field_counts"].items()): + print(f" {field}: {count} frames") + + if stats["detections_per_frame"]: + total_detections = sum(stats["detections_per_frame"]) + avg_detections = total_detections / len(stats["detections_per_frame"]) + max_detections = max(stats["detections_per_frame"]) + print(f"\nDetection Statistics:") + print(f" Total Detections: {total_detections}") + print(f" Average per Frame: {avg_detections:.1f}") + print(f" Max in Single Frame: {max_detections}") + + print("\n" + "=" * 70) + print("\n💡 Benefits of DATA_ONLY mode:") + print(" ✓ Significantly reduced bandwidth (no video sent back)") + print(" ✓ Lower latency for data processing") + print(" ✓ Ideal for headless/server deployments") + print(" ✓ Perfect for analytics and logging use cases") + print("\n") + + +if __name__ == "__main__": + main() diff --git a/examples/webrtc_sdk/dynamic_config_example.py b/examples/webrtc_sdk/dynamic_config_example.py new file mode 100644 index 0000000000..a490e70505 --- /dev/null +++ b/examples/webrtc_sdk/dynamic_config_example.py @@ -0,0 +1,316 @@ +""" +WebRTC SDK example demonstrating dynamic channel configuration. + +This example shows how to change stream and data outputs in real-time +during an active WebRTC session without reconnecting. Uses a workflow +specification directly (no need for workspace/workflow-id). 
+ +Usage: + python examples/webrtc_sdk/dynamic_config_example.py \ + [--api-url http://localhost:9001] \ + [--api-key API_KEY] \ + [--width 1920] \ + [--height 1080] + +Controls: + q - Quit + + - Enable all data outputs + - - Disable all data outputs + a-z - Toggle individual data outputs + 0 - Disable video output + 1-9 - Switch video output + +The example uses a workflow specification defined in the code, so no need +for workspace/workflow-id parameters. Press keys in the preview window to +dynamically control which outputs are sent. +""" +import argparse +import json + +import cv2 + +from inference_sdk import InferenceHTTPClient +from inference_sdk.webrtc import VideoMetadata, WebcamSource, StreamConfig + +# Example workflow specification +# This is a simple workflow that crops and blurs the input image and extracts its aspect ratio +WORKFLOW_SPEC_JSON = """{ + "version": "1.0", + "inputs": [ + { + "type": "InferenceImage", + "name": "image" + } + ], + "steps": [ + { + "type": "roboflow_core/relative_statoic_crop@v1", + "name": "relative_static_crop", + "images": "$inputs.image", + "x_center": 0.5, + "y_center": 0.5, + "width": 0, + "height": 0.5 + }, + { + "type": "roboflow_core/property_definition@v1", + "name": "property_definition", + "data": "$inputs.image", + "operations": [ + { + "type": "ExtractImageProperty", + "property_name": "aspect_ratio" + } + ] + }, + { + "type": "roboflow_core/image_blur@v1", + "name": "image_blur", + "image": "$inputs.image" + } + ], + "outputs": [ + { + "type": "JsonField", + "name": "image_blur", + "coordinates_system": "own", + "selector": "$steps.image_blur.image" + }, + { + "type": "JsonField", + "name": "image", + "coordinates_system": "own", + "selector": "$steps.relative_static_crop.crops" + }, + { + "type": "JsonField", + "name": "original_ratio", + "coordinates_system": "own", + "selector": "$steps.property_definition.output" + } + ] +}""" + +# Parse the JSON specification into a Python dict +WORKFLOW_SPEC = json.loads(WORKFLOW_SPEC_JSON) + 
+def parse_args() -> argparse.Namespace: + p = argparse.ArgumentParser("WebRTC SDK Dynamic Configuration Example") + p.add_argument("--api-url", default="http://localhost:9001") + p.add_argument("--image-input-name", default="image") + p.add_argument("--api-key", default=None) + p.add_argument("--width", type=int, default=None) + p.add_argument("--height", type=int, default=None) + return p.parse_args() + + +def main() -> None: + args = parse_args() + client = InferenceHTTPClient.init(api_url=args.api_url, api_key=args.api_key) + + # Extract available outputs from workflow specification + workflow_outputs = WORKFLOW_SPEC.get("outputs", []) + available_output_names = [o.get("name") for o in workflow_outputs] + + if not workflow_outputs: + print("⚠️ Workflow has no outputs defined") + return + + print(f"Available workflow outputs: {available_output_names}") + + # Prepare source + resolution = None + if args.width and args.height: + resolution = (args.width, args.height) + source = WebcamSource(resolution=resolution) + + # Start with some outputs configured + config = StreamConfig( + stream_output=[available_output_names[0]] if available_output_names else [], # Use first output + data_output=[] # Start with no data outputs + ) + + # Start streaming session with workflow specification + session = client.webrtc.stream( + source=source, + workflow=WORKFLOW_SPEC, # Pass workflow spec directly + image_input=args.image_input_name, + config=config, + ) + + with session: + # Track current configuration state for display + current_data_mode = "none" + active_data_fields = [] # For custom mode + + def draw_output_list(frame): + """Draw list of available outputs with active indicators""" + x_start = 10 + y_start = 80 + line_height = 22 + + # Title + if current_data_mode == "all": + title = "Data Outputs (ALL)" + title_color = (100, 255, 100) + elif current_data_mode == "none": + title = "Data Outputs (NONE)" + title_color = (100, 100, 100) + else: + title = f"Data Outputs 
({len(active_data_fields)} active)" + title_color = (100, 200, 255) + + cv2.putText(frame, title, (x_start, y_start), cv2.FONT_HERSHEY_SIMPLEX, 0.5, title_color, 1, cv2.LINE_AA) + y_start += line_height + 5 + + # Draw each output + for i, output in enumerate(workflow_outputs): + key_letter = chr(ord("a") + i) if i < 26 else "?" + output_name = output.get("name", "unnamed") + + # Determine if active + if current_data_mode == "all": + is_active = True + elif current_data_mode == "none": + is_active = False + else: + is_active = output_name in active_data_fields + + # Format line with ASCII checkbox + indicator = "[X]" if is_active else "[ ]" + color = (100, 255, 100) if is_active else (100, 100, 100) + text = f" [{key_letter}] {indicator} {output_name}" + + cv2.putText( + frame, + text, + (x_start, y_start + i * line_height), + cv2.FONT_HERSHEY_SIMPLEX, + 0.45, + color, + 1, + cv2.LINE_AA + ) + + # Controls + y_controls = y_start + len(workflow_outputs) * line_height + 10 + cv2.putText( + frame, + " [+] All [-] None [1-9] Video Output", + (x_start, y_controls), + cv2.FONT_HERSHEY_SIMPLEX, + 0.45, + (200, 200, 200), + 1, + cv2.LINE_AA + ) + + @session.on_frame + def show_frame(frame, metadata): + nonlocal current_data_mode, active_data_fields + + # Draw output list overlay + draw_output_list(frame) + + # Add controls hint at bottom + controls = "q=quit | +=all | -=none | a-z=toggle data | 0-9=video" + cv2.putText( + frame, + controls, + (10, frame.shape[0] - 10), + cv2.FONT_HERSHEY_SIMPLEX, + 0.4, + (200, 200, 200), + 1 + ) + + cv2.imshow("WebRTC SDK - Dynamic Configuration", frame) + + # Handle keyboard input + key = cv2.waitKey(1) & 0xFF + + if key == ord("q"): + print("Quitting...") + session.stop() + + elif key == ord("+") or key == ord("="): + print("Setting data output to ALL") + session.set_data_outputs(None) + current_data_mode = "all" + + elif key == ord("-"): + print("Setting data output to NONE") + session.set_data_outputs([]) + current_data_mode = "none" + 
+ elif key == ord("0"): + print("Disabling video output") + session.set_stream_output("") + + # Handle 1-9 keys for video output selection + elif ord("1") <= key <= ord("9"): + output_index = key - ord("1") + if output_index < len(available_output_names): + output_name = available_output_names[output_index] + print(f"Switching video to '{output_name}'") + session.set_stream_output(output_name) + + # Handle a-z keys for data output toggling + elif chr(key).isalpha() and chr(key).lower() in "abcdefghijklmnopqrstuvwxyz": + key_index = ord(chr(key).lower()) - ord("a") + if key_index < len(workflow_outputs): + output_name = workflow_outputs[key_index].get("name", "") + + # Toggle logic + if current_data_mode == "all": + # Was "all", switch to custom with all except this one + current_data_mode = "custom" + active_data_fields = list(available_output_names) + active_data_fields.remove(output_name) + print(f"Toggled OFF '{output_name}' (was ALL)") + elif current_data_mode == "none": + # Was "none", enable only this field + current_data_mode = "custom" + active_data_fields = [output_name] + print(f"Toggled ON '{output_name}' (was NONE)") + else: + # Custom mode - toggle + if output_name in active_data_fields: + active_data_fields.remove(output_name) + print(f"Toggled OFF '{output_name}'") + else: + active_data_fields.append(output_name) + print(f"Toggled ON '{output_name}'") + + # Send updated list + print(f"Active fields: {active_data_fields}") + session.set_data_outputs(active_data_fields if active_data_fields else []) + + # Global data handler to monitor what we're receiving + @session.on_data() + def handle_data(data: dict, metadata: VideoMetadata): + if data: + print(f"Frame {metadata.frame_id}: Received fields: {list(data.keys())}") + else: + print(f"Frame {metadata.frame_id}: No data (metadata only)") + + # Run the session (blocks until stop() is called or stream ends) + print("\n=== WebRTC Dynamic Configuration Example ===") + print(f"Available outputs: 
{available_output_names}") + print("\nControls:") + print(" q - Quit") + print(" + - Enable all data outputs") + print(" - - Disable all data outputs (metadata only)") + for i, output in enumerate(workflow_outputs): + key_letter = chr(ord("a") + i) if i < 26 else "?" + print(f" {key_letter} - Toggle '{output.get('name')}' data output") + print(" 0 - Disable video output") + for i, name in enumerate(available_output_names[:9]): + print(f" {i+1} - Switch video to '{name}'") + print("\nPress keys in the video window to control outputs dynamically.\n") + + session.run() + + +if __name__ == "__main__": + main() diff --git a/inference_sdk/webrtc/__init__.py b/inference_sdk/webrtc/__init__.py index c9deb7baf8..bd336d5060 100644 --- a/inference_sdk/webrtc/__init__.py +++ b/inference_sdk/webrtc/__init__.py @@ -1,7 +1,7 @@ """WebRTC SDK for Inference - Unified streaming API.""" from .client import WebRTCClient # noqa: F401 -from .config import StreamConfig # noqa: F401 +from .config import OutputMode, StreamConfig # noqa: F401 from .session import VideoMetadata, WebRTCSession # noqa: F401 from .sources import ( # noqa: F401 ManualSource, @@ -17,6 +17,7 @@ "WebRTCSession", "StreamConfig", "VideoMetadata", + "OutputMode", # Source classes "StreamSource", "WebcamSource", diff --git a/inference_sdk/webrtc/config.py b/inference_sdk/webrtc/config.py index 6fc1460aa4..f4ca9f6448 100644 --- a/inference_sdk/webrtc/config.py +++ b/inference_sdk/webrtc/config.py @@ -1,9 +1,42 @@ """Configuration for WebRTC streaming sessions.""" from dataclasses import dataclass, field +from enum import Enum from typing import Any, Dict, List, Optional +class OutputMode(str, Enum): + """Output mode for WebRTC sessions. + + Determines what data is sent back from the server during processing: + + - DATA_ONLY: Only send JSON data via data channel (no video track sent back). + Use this when you only need inference results/metrics and want to + save bandwidth. 
The server won't send processed video frames back. + + - VIDEO_ONLY: Only send processed video via video track (no data channel messages). + Use this when you only need to display the processed video and don't + need programmatic access to results. + + - BOTH: Send both processed video and JSON data (default behavior). + Use this when you need both visual output and programmatic access to results. + + Examples: + # Data-only mode for analytics/logging (saves bandwidth) + config = StreamConfig(output_mode=OutputMode.DATA_ONLY) + + # Video-only mode for display-only applications + config = StreamConfig(output_mode=OutputMode.VIDEO_ONLY) + + # Both (default) for full-featured applications + config = StreamConfig(output_mode=OutputMode.BOTH) + """ + + DATA_ONLY = "data_only" + VIDEO_ONLY = "video_only" + BOTH = "both" + + @dataclass class StreamConfig: """Unified configuration for all WebRTC stream types. @@ -19,6 +52,9 @@ class StreamConfig: data_output: List[str] = field(default_factory=list) """List of workflow output names to receive via data channel""" + output_mode: OutputMode = OutputMode.BOTH + """Output mode: DATA_ONLY (data channel only), VIDEO_ONLY (video only), or BOTH (default)""" + # Processing configuration realtime_processing: bool = True """Whether to process frames in realtime (drop if can't keep up) or queue all frames""" diff --git a/inference_sdk/webrtc/session.py b/inference_sdk/webrtc/session.py index e6c0bf1f09..d7bbd7dbb2 100644 --- a/inference_sdk/webrtc/session.py +++ b/inference_sdk/webrtc/session.py @@ -20,6 +20,9 @@ from inference_sdk.webrtc.config import StreamConfig from inference_sdk.webrtc.sources import StreamSource +# Sentinel value to distinguish "not provided" from "None" +_UNSET = object() + if TYPE_CHECKING: from aiortc import RTCDataChannel, RTCPeerConnection @@ -337,6 +340,7 @@ def __init__( self._data_field_handlers: dict[str, List[Callable]] = {} self._data_global_handler: Optional[Callable] = None self._stop_event: 
threading.Event = threading.Event() + self._data_channel: Optional["RTCDataChannel"] = None # Public APIs self.video = _VideoStream(self._video_queue) @@ -555,6 +559,122 @@ def process(frame, metadata): if self._stop_event.is_set(): break + def _send_config_message( + self, + stream_output: Any = _UNSET, # noqa: ANN401 + data_output: Any = _UNSET, # noqa: ANN401 + ) -> None: + """Send configuration message to server via data channel. + + Args: + stream_output: Value to set for stream_output field (_UNSET = don't change) + data_output: Value to set for data_output field (_UNSET = don't change) + + Raises: + RuntimeError: If data channel is not open or not initialized + """ + if not self._data_channel: + raise RuntimeError( + "Data channel not initialized. This method can only be called " + "within the WebRTCSession context (after __enter__)." + ) + + if self._data_channel.readyState != "open": + raise RuntimeError( + f"Data channel is not open (state: {self._data_channel.readyState}). " + "Wait for the connection to be established before changing configuration." + ) + + # Build message dict with only the fields to change + message_dict = {} + if stream_output is not _UNSET: + message_dict["stream_output"] = stream_output + if data_output is not _UNSET: + message_dict["data_output"] = data_output + + if not message_dict: + # Nothing to send + return + + # Serialize and send + message_json = json.dumps(message_dict) + + # Send from the async event loop thread + def _send(): + self._data_channel.send(message_json) + + self._loop.call_soon_threadsafe(_send) + + def set_stream_output(self, output_name: Optional[str]) -> None: + """Change which workflow output is rendered on the video track. + + This allows dynamically switching which workflow output is used for + the video stream without reconnecting. Useful for workflows with + multiple visualization outputs. + + Args: + output_name: Name of workflow output to use for video rendering. 
+ - None: Disable rendering / trigger auto-detection + - "" (empty string): Disable rendering / trigger auto-detection + - "output_name": Use specific workflow output + + Raises: + RuntimeError: If data channel is not open or not initialized + + Examples: + # Switch to specific output + session.set_stream_output("visualization") + + # Disable video rendering + session.set_stream_output(None) + + # Let server auto-detect best output + session.set_stream_output("") + + Note: + The server does not validate the output name. If you specify an + invalid output, errors will appear in the data channel responses. + """ + self._send_config_message(stream_output=output_name) + + def set_data_outputs(self, output_names: Optional[List[str]]) -> None: + """Change which workflow outputs are sent via data channel. + + This allows dynamically controlling which workflow outputs are sent + over the data channel without reconnecting. Useful for reducing + bandwidth or focusing on specific outputs. + + Args: + output_names: List of workflow output names to send. + - None: Send ALL workflow outputs + - []: Send NO outputs (metadata only) + - ["field1", "field2"]: Send only specified fields + + Raises: + RuntimeError: If data channel is not open or not initialized + + Examples: + # Send all outputs + session.set_data_outputs(None) + + # Send only metadata (no workflow outputs) + session.set_data_outputs([]) + + # Send specific fields + session.set_data_outputs(["predictions", "visualization"]) + + # Send single field + session.set_data_outputs(["predictions"]) + + Note: + - The server does not validate output names. Invalid names will + result in errors in the data channel responses. + - Images are only serialized when explicitly requested in the list. + - Using None (all outputs) will skip image serialization to save + bandwidth. 
+ """ + self._send_config_message(data_output=output_names) + def _invoke_data_handler( self, handler: Callable, value: Any, metadata: Optional[VideoMetadata] ) -> None: # noqa: ANN401 @@ -725,6 +845,9 @@ async def _reader(): # Keep old binding for tests that still use session.data self.data.bind(ch) + # Store reference for dynamic configuration + self._data_channel = ch + # Setup new data channel message handler @ch.on("message") def _on_data_message(message: Any) -> None: # noqa: ANN401 @@ -810,6 +933,7 @@ def _on_data_message(message: Any) -> None: # noqa: ANN401 "webrtc_realtime_processing": self._config.realtime_processing, "stream_output": self._config.stream_output, "data_output": self._config.data_output, + "output_mode": self._config.output_mode.value, } # Add TURN config if available (auto-fetched or user-provided) @@ -828,7 +952,16 @@ def _on_data_message(message: Any) -> None: # noqa: ANN401 url = f"{self._api_url}/initialise_webrtc_worker" headers = {"Content-Type": "application/json"} resp = requests.post(url, json=payload, headers=headers, timeout=90) - resp.raise_for_status() + try: + resp.raise_for_status() + except requests.exceptions.HTTPError as e: + # Try to get more details from the response + try: + error_detail = resp.json() + logger.error(f"Server returned error: {error_detail}") + except Exception: + logger.error(f"Server response body: {resp.text}") + raise ans: dict[str, Any] = resp.json() # Set remote description