@@ -23,7 +23,7 @@ RTCPeerConnection API in order to build an API that is:
* Able to support user defined component wrapping and replacement

The central idea is to expose components in an RTCPeerConnection as a collection of
- streams (as defined by the [WHATWG Streams API](https://streams.spec.whatwg.org/)),
+ streams (as defined by the [WHATWG Streams API](https://streams.spec.whatwg.org/)),
which can be manipulated to introduce new components, or to wrap or replace existing
components.

@@ -43,6 +43,12 @@ iterations to support additional use cases such as:
* Custom codecs for special purposes (in combination with WebCodecs)

## Code Examples
+ 0. Feature detection can be done as follows:
+
+ <pre>
+ const supportsInsertableStreams = window.RTCRtpSender &&
+   !!RTCRtpSender.prototype.createEncodedStreams;
+ </pre>

1. Let a PeerConnection know that it should allow exposing the data flowing through it
as streams.
@@ -54,8 +60,7 @@ streams. For example:

<pre>
let pc = new RTCPeerConnection({
- forceEncodedVideoInsertableStreams: true,
- forceEncodedAudioInsertableStreams: true
+ encodedInsertableStreams: true,
});
</pre>

@@ -70,25 +75,25 @@ of an encoded frame and adds 4 bytes of padding.
// Called on startup.
},

- async transform(chunk, controller) {
- let view = new DataView(chunk.data);
+ async transform(encodedFrame, controller) {
+ let view = new DataView(encodedFrame.data);
// Create a new buffer with 4 additional bytes.
- let newData = new ArrayBuffer(chunk.data.byteLength + 4);
+ let newData = new ArrayBuffer(encodedFrame.data.byteLength + 4);
let newView = new DataView(newData);

// Fill the new buffer with a negated version of all
// the bits in the original frame.
- for (let i = 0; i < chunk.data.byteLength; ++i)
+ for (let i = 0; i < encodedFrame.data.byteLength; ++i)
newView.setInt8(i, ~view.getInt8(i));
// Set the padding bytes to zero.
for (let i = 0; i < 4; ++i)
- newView.setInt8(chunk.data.byteLength + i, 0);
+ newView.setInt8(encodedFrame.data.byteLength + i, 0);

// Replace the frame's data with the new buffer.
- chunk.data = newData;
+ encodedFrame.data = newData;

// Send it to the output stream.
- controller.enqueue(chunk);
+ controller.enqueue(encodedFrame);
},

flush() {
@@ -104,7 +109,7 @@ Transform stream to the track's sender.
let stream = await navigator.mediaDevices.getUserMedia({video:true});
let [track] = stream.getTracks();
let videoSender = pc.addTrack(track, stream);
- let senderStreams = videoSender.getEncodedVideoStreams();
+ let senderStreams = videoSender.createEncodedStreams();

// Do ICE and offer/answer exchange.

@@ -116,39 +121,30 @@ senderStreams.readable
4. Do the corresponding operations on the receiver side.

<pre>
- let pc = new RTCPeerConnection({forceEncodedVideoInsertableStreams: true});
+ let pc = new RTCPeerConnection({encodedInsertableStreams: true});
pc.ontrack = e => {
- let receivers = pc.getReceivers();
- let videoReceiver = null;
- for (const r of receivers) {
- if (r.track.kind == 'video')
- videoReceiver = r;
- }
- if (!videoReceiver)
- return;
-
let receiverTransform = new TransformStream({
start() {},
flush() {},
- async transform(chunk, controller) {
+ async transform(encodedFrame, controller) {
// Reconstruct the original frame.
- let view = new DataView(chunk.data);
+ let view = new DataView(encodedFrame.data);

// Ignore the last 4 bytes
- let newData = new ArrayBuffer(chunk.data.byteLength - 4);
+ let newData = new ArrayBuffer(encodedFrame.data.byteLength - 4);
let newView = new DataView(newData);

// Negate all bits in the incoming frame, ignoring the
// last 4 bytes
- for (let i = 0; i < chunk.data.byteLength - 4; ++i)
+ for (let i = 0; i < encodedFrame.data.byteLength - 4; ++i)
newView.setInt8(i, ~view.getInt8(i));

- chunk.data = newData;
- controller.enqueue(chunk);
+ encodedFrame.data = newData;
+ controller.enqueue(encodedFrame);
},
});

- let receiverStreams = videoReceiver.createEncodedVideoStreams();
+ let receiverStreams = e.receiver.createEncodedStreams();
receiverStreams.readable
.pipeThrough(receiverTransform)
.pipeTo(receiverStreams.writable);
@@ -158,7 +154,7 @@ pc.ontrack = e => {
## API

The following are the IDL modifications proposed by this API.
- Future iterations will add additional operations following a similar pattern.
+ Future iterations may add additional operations following a similar pattern.

<pre>
// New dictionary.
@@ -175,42 +171,103 @@ enum RTCEncodedVideoFrameType {
"delta",
};

+ // New dictionaries for video and audio metadata.
+ dictionary RTCEncodedVideoFrameMetadata {
+ long long frameId;
+ sequence<long long> dependencies;
+ unsigned short width;
+ unsigned short height;
+ long spatialIndex;
+ long temporalIndex;
+ long synchronizationSource;
+ sequence<long> contributingSources;
+ };
+
+ dictionary RTCEncodedAudioFrameMetadata {
+ long synchronizationSource;
+ sequence<long> contributingSources;
+ };
+
// New interfaces to define encoded video and audio frames. Will eventually
// re-use or extend the equivalent defined in WebCodecs.
- // The additionalData fields contain metadata about the frame and might be
+ // The getMetadata() methods return metadata about the frame and will
// eventually be exposed differently.
interface RTCEncodedVideoFrame {
readonly attribute RTCEncodedVideoFrameType type;
readonly attribute unsigned long long timestamp;
attribute ArrayBuffer data;
- readonly attribute ArrayBuffer additionalData;
+ RTCEncodedVideoFrameMetadata getMetadata();
};

interface RTCEncodedAudioFrame {
readonly attribute unsigned long long timestamp;
attribute ArrayBuffer data;
- readonly attribute ArrayBuffer additionalData;
+ RTCEncodedAudioFrameMetadata getMetadata();
};

-
- // New fields in RTCConfiguration
- dictionary RTCConfiguration {
- ...
- boolean forceEncodedVideoInsertableStreams = false;
- boolean forceEncodedAudioInsertableStreams = false;
+ // New field in RTCConfiguration
+ partial dictionary RTCConfiguration {
+ boolean encodedInsertableStreams = false;
};

// New methods for RTCRtpSender and RTCRtpReceiver
- interface RTCRtpSender {
- // ...
- RTCInsertableStreams createEncodedVideoStreams();
- RTCInsertableStreams createEncodedAudioStreams();
+ partial interface RTCRtpSender {
+ RTCInsertableStreams createEncodedStreams();
};

- interface RTCRtpReceiver {
- // ...
- RTCInsertableStreams createEncodedVideoStreams();
- RTCInsertableStreams createEncodedAudioStreams();
+ partial interface RTCRtpReceiver {
+ RTCInsertableStreams createEncodedStreams();
};

</pre>
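
To illustrate how the proposed metadata accessor might be used, here is a minimal,
hypothetical sketch of a transform that logs per-frame metadata. It assumes a browser
implementing the getMetadata() method proposed above; nothing in it is normative:

<pre>
const loggingTransform = new TransformStream({
  transform(encodedFrame, controller) {
    // getMetadata() is the accessor proposed in the IDL above.
    const metadata = encodedFrame.getMetadata();
    console.log('ssrc:', metadata.synchronizationSource,
                'payload bytes:', encodedFrame.data.byteLength);
    // Pass the frame through unchanged.
    controller.enqueue(encodedFrame);
  }
});
</pre>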
+
+ ## Design considerations
+
+ This design is built upon the Streams API. This is a natural interface for data
+ that can be considered a "sequence of objects", and it has an ecosystem around it
+ that allows some concerns to be handed off easily.
+
+ In particular:
+
+ * Sequencing comes naturally; streams are in-order entities.
+ * With the Transferable Streams paradigm, changing which thread does the
+ processing can be done in a manner that has been tested by others (see the
+ sketch after this list).
+ * Since other users of Streams interfaces are going to deal with issues
+ like efficient handover and WASM interaction, we can expect to leverage
+ common solutions for these problems.
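
As a minimal, non-normative sketch of the Transferable Streams point above: the streams
returned by createEncodedStreams() could be posted to a worker so that the per-frame
processing runs off the main thread. This assumes the browser supports transferring
ReadableStream and WritableStream objects, and "worker.js" is a hypothetical file name:

<pre>
// worker.js (hypothetical) -- runs the per-frame processing off the main thread.
onmessage = ({data}) => {
  const {readable, writable} = data;
  const transform = new TransformStream({
    transform(encodedFrame, controller) {
      // ... process encodedFrame.data here ...
      controller.enqueue(encodedFrame);
    }
  });
  readable.pipeThrough(transform).pipeTo(writable);
};

// Main thread: hand the sender's encoded streams to the worker by transfer.
const worker = new Worker('worker.js');
const senderStreams = videoSender.createEncodedStreams();
worker.postMessage(
    {readable: senderStreams.readable, writable: senderStreams.writable},
    [senderStreams.readable, senderStreams.writable]);
</pre>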
+
+ There are some challenges with the Streams interface:
+
+ * Queueing in response to backpressure isn't an appropriate reaction in a
+ real-time environment. This can be mitigated at the sender by not queueing,
+ preferring to discard frames or not generating them (one possible approach is
+ sketched below).
+ * How to interface to congestion control signals, which travel in the
+ opposite direction from the streams flow.
+ * How to integrate error signalling and recovery, given that most of the
+ time, breaking the pipeline is not an appropriate action.
+
+ These things may be solved by use of non-data "frames" (in the forward direction),
+ by reverse streams of non-data "frames" (in the reverse direction), or by defining
+ new interfaces based on events, promises or callbacks.
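
For the backpressure point, here is a minimal, non-normative sketch of a sender-side
transform that discards frames instead of queueing them when the downstream side signals
backpressure (a real application might instead drop only delta frames, or ask the
encoder to reduce its rate):

<pre>
const droppingTransform = new TransformStream({
  transform(encodedFrame, controller) {
    // desiredSize <= 0 means the readable side's queue is already full;
    // drop this frame rather than let a queue build up.
    if (controller.desiredSize !== null && controller.desiredSize <= 0)
      return;
    controller.enqueue(encodedFrame);
  }
  // Allow a few frames in flight before dropping kicks in.
}, undefined, new CountQueuingStrategy({highWaterMark: 4}));
</pre>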
+
+ Experimentation with the prototype API seems to show that performance is
+ adequate for real-time processing; the streaming part is not contributing
+ very much to slowing down the pipelines.
+
+ ## Alternatives to Streams
+
+ One set of alternatives involves callback-based or event-based interfaces; those
+ would require developing new interfaces that allow the relevant WebRTC
+ objects to be visible in the worker context in order to do processing off
+ the main thread. This would seem to be a significantly bigger specification
+ and implementation effort.
+
+ Another path would involve specifying a worklet API, similar to the AudioWorklet,
+ and specifying new APIs for connecting encoders and decoders to such worklets.
+ This also seemed to involve a significantly larger set of new interfaces, with a
+ correspondingly larger implementation effort, and would offer less flexibility
+ in how the processing elements could be implemented.