@@ -93,7 +93,8 @@ class BetaCudaDeviceInterface : public DeviceInterface {
93
93
94
94
} // namespace facebook::torchcodec
95
95
96
- // Note: [sendPacket, receiveFrame, frame ordering and NVCUVID callbacks]
96
+ /* clang-format off */
97
+ // Note: [General design, sendPacket, receiveFrame, frame ordering and NVCUVID callbacks]
97
98
//
98
99
// At a high level, this decoding interface mimics the FFmpeg send/receive
99
100
// architecture:
@@ -120,46 +121,63 @@ class BetaCudaDeviceInterface : public DeviceInterface {
120
121
// - B1 is a B-frame (bi-directional) which needs both I0 and P2 to be decoded
121
122
// - P2 is a P-frame (predicted frame) which only needs I0 to be decodec.
122
123
//
123
- // Because B1 needs both I0 and P2 to be properly decoded, the decode order must
124
- // be: I0 P2 B1 P4 B3 ... which is different from the display order.
124
+ // Because B1 needs both I0 and P2 to be properly decoded, the decode order
125
+ // (packet order), defined by the encoder, must be: I0 P2 B1 P4 B3 ... which is
126
+ // different from the display order.
125
127
//
126
- // We don't have to worry about the decode order: it's up to the parser to
127
- // figure that out. But we have to make sure that receiveFrame() returns frames
128
- // in display order.
129
- //
130
- // SendPacket(AVPacket)'s job is just to send the packet to the NVCUVID parser
131
- // by calling cuvidParseVideoData(packet). When cuvidParseVideoData(packet) is
132
- // called, it may trigger callbacks, particularly:
128
+ // SendPacket(AVPacket)'s job is just to pass down the packet to the NVCUVID
129
+ // parser by calling cuvidParseVideoData(packet). When
130
+ // cuvidParseVideoData(packet) is called, it may trigger callbacks,
131
+ // particularly:
132
+ // - streamPropertyChange(videoFormat): triggered once at the start of the
133
+ // stream, and possibly later if the stream properties change (e.g.
134
+ // resolution).
133
135
// - frameReadyForDecoding(picParams)): triggered **in decode order** when the
134
136
// parser has accumulated enough data to decode a frame. We send that frame to
135
- // the NVDEC hardware for **async** decoding. While that frame is being
136
- // decoded, we store a light reference (a Slot) to that frame in the
137
- // readyFrames_, and mark that slot as BEING_DECODED. The value that uniquely
138
- // identifies that frame in the readyFrames_ is its "slotId", which is given
139
- // to us by NVCUVID in the callback parameter: picParams->CurrPicIdx.
137
+ // the NVDEC hardware for **async** decoding.
140
138
// - frameReadyInDisplayOrder(dispInfo)): triggered **in display order** when a
141
- // frame is ready to be "displayed" (returned). When it is triggered, we look
142
- // up the corresponding frame/slot in the readyFrames_, using
143
- // dispInfo->picture_index to match it against a given BEING_DECODED slotId.
144
- // We mark that frame/slot as READY_FOR_OUTPUT.
145
- // Crucially, this callback also tells us the pts of that frame. We store
146
- // the pts and other relevant info the slot.
147
- //
148
- // Said differently, from the perspective of the readyFrames_, at any point in
149
- // time a slot/frame in the readyFrames_ can be in 3 states:
150
- // - empty: no slot for that slotId exists in the readyFrames_
151
- // - BEING_DECODED: frameReadyForDecoding was triggered for that frame, and the
152
- // frame was sent to NVDEC for async decoding. We don't know its pts because
153
- // the parser didn't trigger frameReadyInDisplayOrder() for that frame yet.
154
- // - READY_FOR_OUTPUT: frameReadyInDisplayOrder was triggered for that frame, it
155
- // is decoded and ready to be mapped and returned. We know its pts.
156
- //
157
- // Because frameReadyInDisplayOrder is triggered in display order, we know that
158
- // if a slot is READY_FOR_OUTPUT, then all frames with a lower pts are also
159
- // READY_FOR_OUTPUT, or already returned. So when receiveFrame() is called, we
160
- // just need to look for the READY_FOR_OUTPUT slot with the lowest pts, and
161
- // return that frame. This guarantees that receiveFrame() returns frames in
162
- // display order. If no slot is READY_FOR_OUTPUT, then we return EAGAIN to
163
- // indicate the caller should send more packets.
164
- //
165
- // Simple, innit?
139
+ // frame is ready to be "displayed" (returned). At that point, the parser also
140
+ // gives us the pts of that frame. We store (a reference to) that frame in a
141
+ // FIFO queue: readyFrames_.
142
+ //
143
+ // When receiveFrame(AVFrame) is called, if readyFrames_ is not empty, we pop
144
+ // the front of the queue, which is the next frame in display order, and map it
145
+ // to an AVFrame by calling cuvidMapVideoFrame(). If readyFrames_ is empty we
146
+ // return EAGAIN to indicate the caller should send more packets.
147
+ //
148
+ // There is potentially a small inefficiency due to the callback design: in
149
+ // order for us to know that a frame is ready in display order, we need the
150
+ // frameReadyInDisplayOrder callback to be triggered. This can only happen
151
+ // within cuvidParseVideoData(packet) in sendPacket(). This means there may be
152
+ // the following sequence of calls:
153
+ //
154
+ // sendPacket(relevantAVPacket)
155
+ // cuvidParseVideoData(relevantAVPacket)
156
+ // frameReadyForDecoding()
157
+ // cuvidDecodePicture() Send frame to NVDEC for async decoding
158
+ //
159
+ // receiveFrame() -> EAGAIN Frame is potentially already decoded
160
+ // and could be returned, but we don't
161
+ // know because frameReadyInDisplayOrder
162
+ // hasn't been triggered yet. We'll only
163
+ // know after sending another,
164
+ // potentially irrelevant packet.
165
+ //
166
+ // sendPacket(irrelevantAVPacket)
167
+ // cuvidParseVideoData(irrelevantAVPacket)
168
+ // frameReadyInDisplayOrder() Only now do we know that our target
169
+ // frame is ready.
170
+ //
171
+ // receiveFrame() return target frame
172
+ //
173
+ // How much this matters in practice is unclear, but probably negligible in
174
+ // general. Particularly when frames are decoded consecutively anyway, the
175
+ // "irrelevantPacket" is actually relevant for a future target frame.
176
+ //
177
+ // Note that the alternative is to *not* rely on the frameReadyInDisplayOrder
178
+ // callback. It's technically possible, but it would mean we now have to solve
179
+ // two hard, *codec-dependent* problems that the callback was solving for us:
180
+ // - we have to guess the frame's pts ourselves
181
+ // - we have to re-order the frames ourselves to preserve display order.
182
+ //
183
+ /* clang-format on */
0 commit comments