-
Notifications
You must be signed in to change notification settings - Fork 124
feat: Support for video-to-animated-image conversion with frame sampling #273
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 1 commit
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -49,6 +49,13 @@ struct avcodec_decoder_struct { | |
| AVCodecContext* codec; | ||
| AVIOContext* avio; | ||
| int video_stream_index; | ||
|
|
||
| // Multi-frame extraction state | ||
| float frame_sample_interval; // Interval between frames in seconds | ||
| double next_frame_time; // Next frame time to extract | ||
| double last_extracted_pts; // Last extracted frame PTS | ||
| int frame_delay_ms; // Delay for current frame in milliseconds | ||
| bool multi_frame_mode; // Whether we're extracting multiple frames | ||
| }; | ||
|
|
||
| static int avcodec_decoder_read_callback(void* d_void, uint8_t* buf, int buf_size) | ||
|
|
@@ -154,7 +161,9 @@ bool avcodec_decoder_is_streamable(const opencv_mat mat) | |
| return false; | ||
| } | ||
|
|
||
| avcodec_decoder avcodec_decoder_create(const opencv_mat buf, const bool hevc_enabled, const bool av1_enabled) | ||
| avcodec_decoder avcodec_decoder_create(const opencv_mat buf, | ||
| const bool hevc_enabled, | ||
| const bool av1_enabled) | ||
| { | ||
| avcodec_decoder d = new struct avcodec_decoder_struct(); | ||
| memset(d, 0, sizeof(struct avcodec_decoder_struct)); | ||
|
|
@@ -286,7 +295,7 @@ int avcodec_decoder_get_icc(const avcodec_decoder d, void* dest, size_t dest_len | |
| if (!d || !d->codec) { | ||
| return -1; | ||
| } | ||
|
|
||
| const uint8_t* profile_data = avcodec_get_icc_profile(d->codec->color_primaries, profile_size); | ||
|
|
||
| if (profile_size > dest_len) { | ||
|
|
@@ -413,7 +422,7 @@ const char* avcodec_decoder_get_video_codec(const avcodec_decoder d) | |
| if (!d || !d->codec) { | ||
| return "Unknown"; | ||
| } | ||
|
|
||
| switch (d->codec->codec_id) { | ||
| case AV_CODEC_ID_H264: | ||
| return "H264"; | ||
|
|
@@ -437,7 +446,7 @@ const char* avcodec_decoder_get_audio_codec(const avcodec_decoder d) | |
| if (!d || !d->container) { | ||
| return "Unknown"; | ||
| } | ||
|
|
||
| for (unsigned int i = 0; i < d->container->nb_streams; i++) { | ||
| AVStream* stream = d->container->streams[i]; | ||
| if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) { | ||
|
|
@@ -457,7 +466,7 @@ const char* avcodec_decoder_get_audio_codec(const avcodec_decoder d) | |
| } | ||
| } | ||
| } | ||
|
|
||
| return "Unknown"; | ||
| } | ||
|
|
||
|
|
@@ -472,40 +481,59 @@ bool avcodec_decoder_has_subtitles(const avcodec_decoder d) | |
| return false; | ||
| } | ||
|
|
||
| static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame) | ||
| static int avcodec_decoder_convert_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame) | ||
| { | ||
| if (!d || !d->codec || !d->codec->codec || !mat || !frame) { | ||
| return -1; | ||
| } | ||
|
|
||
| auto cvMat = static_cast<cv::Mat*>(mat); | ||
| if (!cvMat) { | ||
| return -1; | ||
| } | ||
|
|
||
| int res = avcodec_receive_frame(d->codec, frame); | ||
| if (res >= 0) { | ||
| // Calculate the step size based on the cv::Mat's width | ||
| int stepSize = | ||
| 4 * cvMat->cols; // Assuming the cv::Mat is in BGRA format, which has 4 channels | ||
| if (cvMat->cols % 32 != 0) { | ||
| int width = cvMat->cols + 32 - (cvMat->cols % 32); | ||
| stepSize = 4 * width; | ||
| // Check if rotation/transformation is needed | ||
| CVImageOrientation orientation = (CVImageOrientation)avcodec_decoder_get_orientation(d); | ||
| bool needs_transformation = (orientation != CV_IMAGE_ORIENTATION_TL); | ||
| bool dimensions_swapped = (orientation == CV_IMAGE_ORIENTATION_RT || orientation == CV_IMAGE_ORIENTATION_LB); | ||
|
|
||
| int res = 0; | ||
| cv::Mat tempMat; | ||
| cv::Mat* decodeDst = cvMat; | ||
|
|
||
| // If transformation is needed, decode to a temporary buffer with raw dimensions | ||
| if (needs_transformation) { | ||
| // For 90/270 rotation, create temp Mat with swapped dimensions | ||
| // For other transformations, use same dimensions as output | ||
| if (dimensions_swapped) { | ||
| tempMat = cv::Mat(frame->height, frame->width, CV_8UC4); | ||
| } else { | ||
| tempMat = cv::Mat(cvMat->rows, cvMat->cols, CV_8UC4); | ||
| } | ||
| if (!opencv_mat_set_row_stride(mat, stepSize)) { | ||
| decodeDst = &tempMat; | ||
| } | ||
|
|
||
| { | ||
| // Use the decode destination's actual step (stride) | ||
| int stepSize = decodeDst->step; | ||
|
|
||
| // Validate that the stride and height are within the allocated buffer bounds | ||
| size_t required_size = stepSize * decodeDst->rows; | ||
| size_t available_size = | ||
| (decodeDst->datalimit && decodeDst->data) ? (decodeDst->datalimit - decodeDst->data) : 0; | ||
| if (available_size > 0 && required_size > available_size) { | ||
| return -1; | ||
| } | ||
|
|
||
| // Create SwsContext for converting the frame format and scaling | ||
| // Create SwsContext for converting the frame format | ||
| struct SwsContext* sws = | ||
| sws_getContext(frame->width, | ||
| frame->height, | ||
| (AVPixelFormat)(frame->format), // Source dimensions and format | ||
| cvMat->cols, | ||
| cvMat->rows, | ||
| AV_PIX_FMT_BGRA, // Destination dimensions and format | ||
| SWS_BILINEAR, // Specify the scaling algorithm; you can choose another | ||
| // according to your needs | ||
| decodeDst->cols, | ||
| decodeDst->rows, | ||
| AV_PIX_FMT_BGRA, // Destination format | ||
| SWS_BILINEAR, | ||
| NULL, | ||
| NULL, | ||
| NULL); | ||
|
|
@@ -541,9 +569,8 @@ static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, A | |
| sws_setColorspaceDetails(sws, inv_table, srcRange, table, 1, 0, 1 << 16, 1 << 16); | ||
|
|
||
| // The linesizes and data pointers for the destination | ||
| int dstLinesizes[4]; | ||
| av_image_fill_linesizes(dstLinesizes, AV_PIX_FMT_BGRA, stepSize / 4); | ||
| uint8_t* dstData[4] = {cvMat->data, NULL, NULL, NULL}; | ||
| int dstLinesizes[4] = {stepSize, 0, 0, 0}; | ||
| uint8_t* dstData[4] = {decodeDst->data, NULL, NULL, NULL}; | ||
|
|
||
| // Perform the scaling and format conversion | ||
| sws_scale(sws, frame->data, frame->linesize, 0, frame->height, dstData, dstLinesizes); | ||
|
|
@@ -552,6 +579,28 @@ static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, A | |
| sws_freeContext(sws); | ||
| } | ||
|
|
||
| // Apply orientation transformation if needed | ||
| if (needs_transformation) { | ||
| cv::OrientationTransform(int(orientation), tempMat); | ||
|
|
||
| // Verify dimensions match after transformation | ||
| if (tempMat.cols != cvMat->cols || tempMat.rows != cvMat->rows) { | ||
| return -1; | ||
| } | ||
|
|
||
| // Copy the transformed image to the output Mat, respecting stride | ||
| opencv_mat_copy_with_stride(&tempMat, cvMat); | ||
| } | ||
|
|
||
| return res; | ||
| } | ||
|
|
||
| static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame) | ||
| { | ||
| int res = avcodec_receive_frame(d->codec, frame); | ||
| if (res >= 0) { | ||
| return avcodec_decoder_convert_frame(d, mat, frame); | ||
| } | ||
| return res; | ||
| } | ||
|
|
||
|
|
@@ -573,12 +622,106 @@ static int avcodec_decoder_decode_packet(const avcodec_decoder d, opencv_mat mat | |
| return res; | ||
| } | ||
|
|
||
| bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat) | ||
| // Helper function to check if a frame should be extracted and process it | ||
| // Returns true if the frame was processed and should be returned | ||
| static bool avcodec_decoder_process_sampled_frame(avcodec_decoder d, | ||
| opencv_mat mat, | ||
| AVFrame* frame, | ||
| AVStream* video_stream, | ||
| int* out_result) | ||
| { | ||
| double frame_time = -1.0; | ||
| if (frame->pts != AV_NOPTS_VALUE) { | ||
| frame_time = frame->pts * av_q2d(video_stream->time_base); | ||
| } | ||
|
|
||
| // Check if this frame should be extracted based on sampling interval | ||
| if (frame_time >= 0 && frame_time >= d->next_frame_time) { | ||
| // Calculate frame delay for animation | ||
| if (d->last_extracted_pts >= 0) { | ||
| double delay_seconds = frame_time - d->last_extracted_pts; | ||
| int delay_ms = (int)(delay_seconds * 1000.0); | ||
| // Validate delay is reasonable (between 1ms and 60 seconds) | ||
| // Use sample interval if delay is out of bounds | ||
| if (delay_ms > 0 && delay_ms <= 60000) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment; the reason will be displayed to describe this comment to others. Learn more. Comment: is this the output delay per frame? If so, arguably 1ms is pretty low. What's a reasonable default?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment; the reason will be displayed to describe this comment to others. Learn more. Reply: Good point — yeah, in practice we'll use something like 100–200ms. I don't think we want to have a default, since the feature is enabled by specifying a positive non-zero value. |
||
| d->frame_delay_ms = delay_ms; | ||
| } else { | ||
| d->frame_delay_ms = (int)(d->frame_sample_interval * 1000.0); | ||
| } | ||
| } else { | ||
| d->frame_delay_ms = (int)(d->frame_sample_interval * 1000.0); | ||
| } | ||
|
|
||
| d->last_extracted_pts = frame_time; | ||
| d->next_frame_time = frame_time + d->frame_sample_interval; | ||
|
|
||
| // Convert frame to output mat | ||
| *out_result = avcodec_decoder_convert_frame(d, mat, frame); | ||
| return true; | ||
| } | ||
|
|
||
| return false; | ||
| } | ||
|
|
||
| bool avcodec_decoder_decode(avcodec_decoder d, opencv_mat mat) | ||
| { | ||
| if (!d || !d->container || !d->codec || !mat) { | ||
| return false; | ||
| } | ||
|
|
||
| AVPacket packet; | ||
| AVStream* video_stream = d->container->streams[d->video_stream_index]; | ||
|
|
||
| // If we're in multi-frame mode, we need to sample frames based on time | ||
| if (d->multi_frame_mode) { | ||
| AVFrame* frame = av_frame_alloc(); | ||
| if (!frame) { | ||
| return false; | ||
| } | ||
|
|
||
| while (true) { | ||
| int res = av_read_frame(d->container, &packet); | ||
| if (res < 0) { | ||
| // Reached EOF - flush remaining frames from the decoder | ||
| avcodec_send_packet(d->codec, NULL); | ||
| while (avcodec_receive_frame(d->codec, frame) == 0) { | ||
| int convert_result; | ||
| if (avcodec_decoder_process_sampled_frame(d, mat, frame, video_stream, &convert_result)) { | ||
| av_frame_free(&frame); | ||
| return (convert_result >= 0); | ||
| } | ||
| av_frame_unref(frame); | ||
| } | ||
|
|
||
| // No more frames available | ||
| av_frame_free(&frame); | ||
| return false; | ||
| } | ||
|
|
||
| if (packet.stream_index != d->video_stream_index) { | ||
| av_packet_unref(&packet); | ||
| continue; | ||
| } | ||
|
|
||
| res = avcodec_send_packet(d->codec, &packet); | ||
| av_packet_unref(&packet); | ||
|
|
||
| if (res < 0) { | ||
| continue; | ||
| } | ||
|
|
||
| while (avcodec_receive_frame(d->codec, frame) == 0) { | ||
| int convert_result; | ||
| if (avcodec_decoder_process_sampled_frame(d, mat, frame, video_stream, &convert_result)) { | ||
| av_frame_free(&frame); | ||
| return (convert_result >= 0); | ||
| } | ||
| av_frame_unref(frame); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // Single-frame mode: just decode the first video frame | ||
| bool done = false; | ||
| bool success = false; | ||
| while (!done) { | ||
|
|
@@ -601,6 +744,26 @@ bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat) | |
| return success; | ||
| } | ||
|
|
||
| void avcodec_decoder_set_frame_sample_interval(avcodec_decoder d, float interval_seconds) | ||
| { | ||
| if (!d) { | ||
| return; | ||
| } | ||
| d->frame_sample_interval = interval_seconds; | ||
| d->next_frame_time = 0.0; | ||
| d->last_extracted_pts = -1.0; | ||
| d->frame_delay_ms = 0; | ||
| d->multi_frame_mode = (interval_seconds > 0.0); | ||
| } | ||
|
|
||
| int avcodec_decoder_get_frame_delay_ms(const avcodec_decoder d) | ||
| { | ||
| if (!d) { | ||
| return 0; | ||
| } | ||
| return d->frame_delay_ms; | ||
| } | ||
|
|
||
| void avcodec_decoder_release(avcodec_decoder d) | ||
| { | ||
| if (d->codec) { | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment.
The reason will be displayed to describe this comment to others. Learn more.
Comment: my instinct is to use integers for exact precision unless there's a specific need for floats.