diff --git a/avcodec.cpp b/avcodec.cpp index 5d480dfe..951f0b44 100644 --- a/avcodec.cpp +++ b/avcodec.cpp @@ -49,6 +49,13 @@ struct avcodec_decoder_struct { AVCodecContext* codec; AVIOContext* avio; int video_stream_index; + + // Multi-frame extraction state + int frame_sample_interval_ms; // Interval between frames in milliseconds + double next_frame_time; // Next frame time to extract + double last_extracted_pts; // Last extracted frame PTS + int frame_delay_ms; // Delay for current frame in milliseconds + bool multi_frame_mode; // Whether we're extracting multiple frames }; static int avcodec_decoder_read_callback(void* d_void, uint8_t* buf, int buf_size) @@ -154,7 +161,9 @@ bool avcodec_decoder_is_streamable(const opencv_mat mat) return false; } -avcodec_decoder avcodec_decoder_create(const opencv_mat buf, const bool hevc_enabled, const bool av1_enabled) +avcodec_decoder avcodec_decoder_create(const opencv_mat buf, + const bool hevc_enabled, + const bool av1_enabled) { avcodec_decoder d = new struct avcodec_decoder_struct(); memset(d, 0, sizeof(struct avcodec_decoder_struct)); @@ -286,7 +295,7 @@ int avcodec_decoder_get_icc(const avcodec_decoder d, void* dest, size_t dest_len if (!d || !d->codec) { return -1; } - + const uint8_t* profile_data = avcodec_get_icc_profile(d->codec->color_primaries, profile_size); if (profile_size > dest_len) { @@ -413,7 +422,7 @@ const char* avcodec_decoder_get_video_codec(const avcodec_decoder d) if (!d || !d->codec) { return "Unknown"; } - + switch (d->codec->codec_id) { case AV_CODEC_ID_H264: return "H264"; @@ -437,7 +446,7 @@ const char* avcodec_decoder_get_audio_codec(const avcodec_decoder d) if (!d || !d->container) { return "Unknown"; } - + for (unsigned int i = 0; i < d->container->nb_streams; i++) { AVStream* stream = d->container->streams[i]; if (stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO) { @@ -457,7 +466,7 @@ const char* avcodec_decoder_get_audio_codec(const avcodec_decoder d) } } } - + return "Unknown"; } @@ 
-472,40 +481,59 @@ bool avcodec_decoder_has_subtitles(const avcodec_decoder d) return false; } -static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame) +static int avcodec_decoder_convert_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame) { if (!d || !d->codec || !d->codec->codec || !mat || !frame) { return -1; } - + auto cvMat = static_cast(mat); if (!cvMat) { return -1; } - int res = avcodec_receive_frame(d->codec, frame); - if (res >= 0) { - // Calculate the step size based on the cv::Mat's width - int stepSize = - 4 * cvMat->cols; // Assuming the cv::Mat is in BGRA format, which has 4 channels - if (cvMat->cols % 32 != 0) { - int width = cvMat->cols + 32 - (cvMat->cols % 32); - stepSize = 4 * width; + // Check if rotation/transformation is needed + CVImageOrientation orientation = (CVImageOrientation)avcodec_decoder_get_orientation(d); + bool needs_transformation = (orientation != CV_IMAGE_ORIENTATION_TL); + bool dimensions_swapped = (orientation == CV_IMAGE_ORIENTATION_RT || orientation == CV_IMAGE_ORIENTATION_LB); + + int res = 0; + cv::Mat tempMat; + cv::Mat* decodeDst = cvMat; + + // If transformation is needed, decode to a temporary buffer with raw dimensions + if (needs_transformation) { + // For 90/270 rotation, create temp Mat with swapped dimensions + // For other transformations, use same dimensions as output + if (dimensions_swapped) { + tempMat = cv::Mat(frame->height, frame->width, CV_8UC4); + } else { + tempMat = cv::Mat(cvMat->rows, cvMat->cols, CV_8UC4); } - if (!opencv_mat_set_row_stride(mat, stepSize)) { + decodeDst = &tempMat; + } + + { + // Use the decode destination's actual step (stride) + int stepSize = decodeDst->step; + + // Validate that the stride and height are within the allocated buffer bounds + size_t required_size = stepSize * decodeDst->rows; + size_t available_size = + (decodeDst->datalimit && decodeDst->data) ? 
(decodeDst->datalimit - decodeDst->data) : 0; + if (available_size > 0 && required_size > available_size) { return -1; } - // Create SwsContext for converting the frame format and scaling + // Create SwsContext for converting the frame format struct SwsContext* sws = sws_getContext(frame->width, frame->height, (AVPixelFormat)(frame->format), // Source dimensions and format - cvMat->cols, - cvMat->rows, - AV_PIX_FMT_BGRA, // Destination dimensions and format - SWS_BILINEAR, // Specify the scaling algorithm; you can choose another - // according to your needs + decodeDst->cols, + decodeDst->rows, + AV_PIX_FMT_BGRA, // Destination format + SWS_BILINEAR, NULL, NULL, NULL); @@ -541,9 +569,8 @@ static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, A sws_setColorspaceDetails(sws, inv_table, srcRange, table, 1, 0, 1 << 16, 1 << 16); // The linesizes and data pointers for the destination - int dstLinesizes[4]; - av_image_fill_linesizes(dstLinesizes, AV_PIX_FMT_BGRA, stepSize / 4); - uint8_t* dstData[4] = {cvMat->data, NULL, NULL, NULL}; + int dstLinesizes[4] = {stepSize, 0, 0, 0}; + uint8_t* dstData[4] = {decodeDst->data, NULL, NULL, NULL}; // Perform the scaling and format conversion sws_scale(sws, frame->data, frame->linesize, 0, frame->height, dstData, dstLinesizes); @@ -552,6 +579,28 @@ static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, A sws_freeContext(sws); } + // Apply orientation transformation if needed + if (needs_transformation) { + cv::OrientationTransform(int(orientation), tempMat); + + // Verify dimensions match after transformation + if (tempMat.cols != cvMat->cols || tempMat.rows != cvMat->rows) { + return -1; + } + + // Copy the transformed image to the output Mat, respecting stride + opencv_mat_copy_with_stride(&tempMat, cvMat); + } + + return res; +} + +static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame) +{ + int res = avcodec_receive_frame(d->codec, frame); + 
if (res >= 0) { + return avcodec_decoder_convert_frame(d, mat, frame); + } return res; } @@ -573,12 +622,106 @@ static int avcodec_decoder_decode_packet(const avcodec_decoder d, opencv_mat mat return res; } -bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat) +// Helper function to check if a frame should be extracted and process it +// Returns true if the frame was processed and should be returned +static bool avcodec_decoder_process_sampled_frame(avcodec_decoder d, + opencv_mat mat, + AVFrame* frame, + AVStream* video_stream, + int* out_result) +{ + double frame_time = -1.0; + if (frame->pts != AV_NOPTS_VALUE) { + frame_time = frame->pts * av_q2d(video_stream->time_base); + } + + // Check if this frame should be extracted based on sampling interval + if (frame_time >= 0 && frame_time >= d->next_frame_time) { + // Calculate frame delay for animation + if (d->last_extracted_pts >= 0) { + double delay_seconds = frame_time - d->last_extracted_pts; + int delay_ms = (int)(delay_seconds * 1000.0); + // Validate delay is reasonable (between 1ms and 60 seconds) + // Use sample interval if delay is out of bounds + if (delay_ms > 0 && delay_ms <= 60000) { + d->frame_delay_ms = delay_ms; + } else { + d->frame_delay_ms = (int)(d->frame_sample_interval_ms); + } + } else { + d->frame_delay_ms = (int)(d->frame_sample_interval_ms); + } + + d->last_extracted_pts = frame_time; + d->next_frame_time = frame_time + (d->frame_sample_interval_ms / 1000.0); + + // Convert frame to output mat + *out_result = avcodec_decoder_convert_frame(d, mat, frame); + return true; + } + + return false; +} + +bool avcodec_decoder_decode(avcodec_decoder d, opencv_mat mat) { if (!d || !d->container || !d->codec || !mat) { return false; } + AVPacket packet; + AVStream* video_stream = d->container->streams[d->video_stream_index]; + + // If we're in multi-frame mode, we need to sample frames based on time + if (d->multi_frame_mode) { + AVFrame* frame = av_frame_alloc(); + if (!frame) { + 
return false; + } + + while (true) { + int res = av_read_frame(d->container, &packet); + if (res < 0) { + // Reached EOF - flush remaining frames from the decoder + avcodec_send_packet(d->codec, NULL); + while (avcodec_receive_frame(d->codec, frame) == 0) { + int convert_result; + if (avcodec_decoder_process_sampled_frame(d, mat, frame, video_stream, &convert_result)) { + av_frame_free(&frame); + return (convert_result >= 0); + } + av_frame_unref(frame); + } + + // No more frames available + av_frame_free(&frame); + return false; + } + + if (packet.stream_index != d->video_stream_index) { + av_packet_unref(&packet); + continue; + } + + res = avcodec_send_packet(d->codec, &packet); + av_packet_unref(&packet); + + if (res < 0) { + continue; + } + + while (avcodec_receive_frame(d->codec, frame) == 0) { + int convert_result; + if (avcodec_decoder_process_sampled_frame(d, mat, frame, video_stream, &convert_result)) { + av_frame_free(&frame); + return (convert_result >= 0); + } + av_frame_unref(frame); + } + } + } + + // Single-frame mode: just decode the first video frame bool done = false; bool success = false; while (!done) { @@ -601,6 +744,26 @@ bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat) return success; } +void avcodec_decoder_set_frame_sample_interval_ms(avcodec_decoder d, int frame_sample_interval_ms) +{ + if (!d) { + return; + } + d->frame_sample_interval_ms = frame_sample_interval_ms; + d->next_frame_time = 0.0; + d->last_extracted_pts = -1.0; + d->frame_delay_ms = 0; + d->multi_frame_mode = (frame_sample_interval_ms > 0); +} + +int avcodec_decoder_get_frame_delay_ms(const avcodec_decoder d) +{ + if (!d) { + return 0; + } + return d->frame_delay_ms; +} + void avcodec_decoder_release(avcodec_decoder d) { if (d->codec) { diff --git a/avcodec.go b/avcodec.go index db37f3c8..29f59118 100644 --- a/avcodec.go +++ b/avcodec.go @@ -22,13 +22,15 @@ var av1Enabled string // avCodecDecoder handles decoding of various video/image formats using 
FFmpeg's avcodec. type avCodecDecoder struct { - decoder C.avcodec_decoder - mat C.opencv_mat - buf []byte - hasDecoded bool - maybeMP4 bool - isStreamable bool - hasSubtitles bool + decoder C.avcodec_decoder + mat C.opencv_mat + buf []byte + hasDecoded bool + maybeMP4 bool + isStreamable bool + hasSubtitles bool + multiFrameMode bool + frameSampleIntervalMs int } // newAVCodecDecoder creates a new decoder instance from the provided buffer. @@ -130,14 +132,23 @@ func (d *avCodecDecoder) Duration() time.Duration { } // Header returns the image metadata including dimensions, pixel format, and orientation. -// Frame count is always 1 since it requires the entire buffer to be decoded. +// Frame count is 1 for single-frame mode, or estimated from duration and sample interval in multi-frame mode. func (d *avCodecDecoder) Header() (*ImageHeader, error) { + numFrames := 1 + if d.multiFrameMode && d.frameSampleIntervalMs > 0 { + // Estimate the number of frames based on duration and sample interval + duration := float64(C.avcodec_decoder_get_duration(d.decoder)) + if duration > 0 { + numFrames = int(duration/(float64(d.frameSampleIntervalMs)/1000.0)) + 1 + } + } + return &ImageHeader{ width: int(C.avcodec_decoder_get_width(d.decoder)), height: int(C.avcodec_decoder_get_height(d.decoder)), pixelType: PixelType(C.CV_8UC4), orientation: ImageOrientation(C.avcodec_decoder_get_orientation(d.decoder)), - numFrames: 1, + numFrames: numFrames, contentLength: len(d.buf), }, nil } @@ -145,7 +156,8 @@ func (d *avCodecDecoder) Header() (*ImageHeader, error) { // DecodeTo decodes the next frame into the provided Framebuffer. // Returns io.EOF when no more frames are available. 
func (d *avCodecDecoder) DecodeTo(f *Framebuffer) error { - if d.hasDecoded { + // In single-frame mode, only decode once + if !d.multiFrameMode && d.hasDecoded { return io.EOF } h, err := d.Header() @@ -156,16 +168,32 @@ func (d *avCodecDecoder) DecodeTo(f *Framebuffer) error { if err != nil { return err } + + // Call decode - it handles both single-frame and multi-frame modes internally ret := C.avcodec_decoder_decode(d.decoder, f.mat) if !ret { - return ErrDecodingFailed + return io.EOF } + + // Set frame properties + if d.multiFrameMode { + // Get the frame delay from the decoder + frameDelayMs := int(C.avcodec_decoder_get_frame_delay_ms(d.decoder)) + // Validate delay is reasonable (WebP supports up to 65535ms per frame) + // Fall back to sample interval if delay is invalid + if frameDelayMs <= 0 || frameDelayMs > 65535 { + frameDelayMs = d.frameSampleIntervalMs + } + f.duration = time.Duration(frameDelayMs) * time.Millisecond + } else { + f.duration = time.Duration(0) + d.hasDecoded = true + } + f.blend = NoBlend f.dispose = DisposeToBackgroundColor - f.duration = time.Duration(0) f.xOffset = 0 f.yOffset = 0 - d.hasDecoded = true return nil } @@ -174,6 +202,14 @@ func (d *avCodecDecoder) SkipFrame() error { return ErrSkipNotSupported } +// SetFrameSampleInterval configures the decoder to extract frames at the specified +// interval in milliseconds. This enables multi-frame extraction mode for videos. +func (d *avCodecDecoder) SetFrameSampleInterval(frameSampleIntervalMs int) { + d.multiFrameMode = true + d.frameSampleIntervalMs = frameSampleIntervalMs + C.avcodec_decoder_set_frame_sample_interval_ms(d.decoder, C.int(frameSampleIntervalMs)) +} + // Close releases all resources associated with the decoder. 
func (d *avCodecDecoder) Close() { C.avcodec_decoder_release(d.decoder) diff --git a/avcodec.hpp b/avcodec.hpp index 38efde54..1d2fe3ce 100644 --- a/avcodec.hpp +++ b/avcodec.hpp @@ -11,19 +11,23 @@ typedef struct avcodec_decoder_struct* avcodec_decoder; void avcodec_init(); -avcodec_decoder avcodec_decoder_create(const opencv_mat buf, const bool hevc_enabled, const bool av1_enabled); +avcodec_decoder avcodec_decoder_create(const opencv_mat buf, + const bool hevc_enabled, + const bool av1_enabled); void avcodec_decoder_release(avcodec_decoder d); int avcodec_decoder_get_width(const avcodec_decoder d); int avcodec_decoder_get_height(const avcodec_decoder d); int avcodec_decoder_get_orientation(const avcodec_decoder d); float avcodec_decoder_get_duration(const avcodec_decoder d); -bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat); +bool avcodec_decoder_decode(avcodec_decoder d, opencv_mat mat); bool avcodec_decoder_is_streamable(const opencv_mat buf); bool avcodec_decoder_has_subtitles(const avcodec_decoder d); const char* avcodec_decoder_get_description(const avcodec_decoder d); const char* avcodec_decoder_get_video_codec(const avcodec_decoder d); const char* avcodec_decoder_get_audio_codec(const avcodec_decoder d); int avcodec_decoder_get_icc(const avcodec_decoder d, void* dest, size_t dest_len); +void avcodec_decoder_set_frame_sample_interval_ms(avcodec_decoder d, const int frame_sample_interval_ms); +int avcodec_decoder_get_frame_delay_ms(const avcodec_decoder d); #ifdef __cplusplus } diff --git a/giflib.cpp b/giflib.cpp index 3120d390..28fd3aa7 100644 --- a/giflib.cpp +++ b/giflib.cpp @@ -484,7 +484,7 @@ static bool giflib_decoder_render_frame(giflib_decoder d, GraphicsControlBlock* // Save current frame content before drawing new frame if (d->have_read_first_frame) { - memcpy(d->prev_frame_bgra.data(), cvMat->data, buf_width * buf_height * BYTES_PER_PIXEL); + opencv_mat_copy_to_packed_buffer(mat, d->prev_frame_bgra.data(), buf_width, 
buf_height, BYTES_PER_PIXEL); } // Draw the new frame @@ -1083,7 +1083,7 @@ static bool giflib_encoder_render_frame(giflib_encoder e, } // XXX change this if we do partial frames (only copy over some) - memcpy(e->prev_frame_bgra, frame->data, 4 * e->gif->SWidth * e->gif->SHeight); + opencv_mat_copy_to_packed_buffer((opencv_mat)frame, e->prev_frame_bgra, e->gif->SWidth, e->gif->SHeight, BYTES_PER_PIXEL); e->prev_frame_color_map = color_map; e->prev_frame_disposal = gcb.DisposalMode; diff --git a/lilliput.go b/lilliput.go index a1c09e96..5557ba64 100644 --- a/lilliput.go +++ b/lilliput.go @@ -80,6 +80,16 @@ type Decoder interface { AudioCodec() string } +// VideoDecoder extends Decoder with video-specific functionality +type VideoDecoder interface { + Decoder + + // SetFrameSampleInterval configures the decoder to extract frames at the specified + // interval in milliseconds. For example, 100 means extract a frame every 100ms (10 FPS). + // This enables multi-frame extraction mode. + SetFrameSampleInterval(intervalMs int) +} + // An Encoder compresses raw pixel data into a well-known image type. type Encoder interface { // Encode encodes the pixel data in f into the dst provided to NewEncoder. 
Encode quality diff --git a/opencv.cpp b/opencv.cpp index 5ddd6661..13b73f77 100644 --- a/opencv.cpp +++ b/opencv.cpp @@ -29,6 +29,17 @@ opencv_mat opencv_mat_create_from_data(int width, int height, int type, void* da return mat; } +opencv_mat opencv_mat_create_from_data_with_stride(int width, int height, int type, void* data, size_t data_len, size_t step) +{ + size_t min_size = step * height; + if (min_size > data_len) { + return NULL; + } + auto mat = new cv::Mat(height, width, type, data, step); + mat->datalimit = (uint8_t*)data + data_len; + return mat; +} + opencv_mat opencv_mat_create_empty_from_data(int length, void* data) { // this is slightly sketchy - what we're going to do is build a 1x0 matrix @@ -214,6 +225,39 @@ void opencv_mat_orientation_transform(CVImageOrientation orientation, opencv_mat cv::OrientationTransform(int(orientation), *cvMat); } +void opencv_mat_copy_with_stride(const opencv_mat src, opencv_mat dst) +{ + auto srcMat = static_cast(src); + auto dstMat = static_cast(dst); + + // Verify dimensions match + if (srcMat->rows != dstMat->rows || srcMat->cols != dstMat->cols) { + return; + } + + // Copy row by row to handle different strides + size_t rowBytes = srcMat->cols * srcMat->elemSize(); + for (int y = 0; y < srcMat->rows; y++) { + memcpy(dstMat->data + y * dstMat->step, + srcMat->data + y * srcMat->step, + rowBytes); + } +} + +void opencv_mat_copy_to_packed_buffer(const opencv_mat src, void* dst_buffer, int width, int height, int channels) +{ + auto srcMat = static_cast(src); + auto dst = static_cast(dst_buffer); + + // Copy row by row from cv::Mat (with stride) to packed buffer (no stride) + size_t rowBytes = width * channels; + for (int y = 0; y < height; y++) { + memcpy(dst + y * rowBytes, + srcMat->data + y * srcMat->step, + rowBytes); + } +} + int opencv_mat_get_width(const opencv_mat mat) { auto cvMat = static_cast(mat); diff --git a/opencv.go b/opencv.go index 655f3290..d230228f 100644 --- a/opencv.go +++ b/opencv.go @@ -178,6 
+178,12 @@ func (h *ImageHeader) IsAnimated() bool { return h.numFrames > 1 } +// NumFrames returns the number of frames in the image. +// Returns 1 for static images, >1 for animations. +func (h *ImageHeader) NumFrames() int { + return h.numFrames +} + // HasAlpha returns true if the image has an alpha channel. func (h *ImageHeader) HasAlpha() bool { return h.pixelType.Channels() == 4 @@ -191,10 +197,34 @@ func (h *ImageHeader) ContentLength() int { return h.contentLength } +// calculateAlignedStride returns a row stride in bytes, padding width up to a multiple of 32/channels pixels. +// The result is 32-byte aligned (for SIMD performance) only when channels divides 32; 3 channels gives multiples of 30. +// For example, with 4 channels (BGRA): 8 pixels * 4 bytes = 32 bytes alignment. +func calculateAlignedStride(width, channels int) int { + stride := width * channels + alignmentPixels := 32 / channels // 8 for 4-channel, 16 for 2-channel, etc. + if alignmentPixels > 0 && width%alignmentPixels != 0 { + alignedWidth := width + alignmentPixels - (width % alignmentPixels) + stride = alignedWidth * channels + } + return stride +} + // NewFramebuffer creates a backing store for a pixel frame buffer with the specified dimensions. +// The buffer is allocated with 32-byte aligned strides for optimal SIMD performance. +// +// Buffer Allocation: The buffer is allocated assuming 4-channel (BGRA) pixel format with +// 32-byte alignment (8 pixels * 4 bytes = 32 bytes). This allocation is sufficient for +// all pixel types since: +// - 4-channel requires: alignedWidth * 4 * height bytes (allocated) +// - 3-channel requires: alignedWidth * 3 * height bytes (always <= 4-channel) +// - 1-channel requires: alignedWidth * 1 * height bytes (always <= 4-channel) +// +// The resizeMat method will use the appropriate stride based on the actual pixel type. 
func NewFramebuffer(width, height int) *Framebuffer { + stride := calculateAlignedStride(width, 4) // Allocate for 4-channel (BGRA) return &Framebuffer{ - buf: make([]byte, width*height*4), + buf: make([]byte, stride*height), mat: nil, } } @@ -243,7 +273,11 @@ func (f *Framebuffer) resizeMat(width, height int, pixelType PixelType) error { if pixelType.Depth() > 8 { pixelType = PixelType(C.opencv_type_convert_depth(C.int(pixelType), C.CV_8U)) } - newMat := C.opencv_mat_create_from_data(C.int(width), C.int(height), C.int(pixelType), unsafe.Pointer(&f.buf[0]), C.size_t(len(f.buf))) + + // Calculate aligned stride (32-byte aligned for SIMD performance) + stride := calculateAlignedStride(width, pixelType.Channels()) + + newMat := C.opencv_mat_create_from_data_with_stride(C.int(width), C.int(height), C.int(pixelType), unsafe.Pointer(&f.buf[0]), C.size_t(len(f.buf)), C.size_t(stride)) if newMat == nil { return ErrBufTooSmall } diff --git a/opencv.hpp b/opencv.hpp index 7123cc8b..c66c3115 100644 --- a/opencv.hpp +++ b/opencv.hpp @@ -98,6 +98,12 @@ opencv_mat opencv_mat_create_from_data(int width, int type, void* data, size_t data_len); +opencv_mat opencv_mat_create_from_data_with_stride(int width, + int height, + int type, + void* data, + size_t data_len, + size_t step); opencv_mat opencv_mat_create_empty_from_data(int length, void* data); bool opencv_mat_set_row_stride(opencv_mat mat, size_t stride); void opencv_mat_release(opencv_mat mat); @@ -108,6 +114,8 @@ void opencv_mat_resize(const opencv_mat src, int interpolation); opencv_mat opencv_mat_crop(const opencv_mat src, int x, int y, int width, int height); void opencv_mat_orientation_transform(CVImageOrientation orientation, opencv_mat mat); +void opencv_mat_copy_with_stride(const opencv_mat src, opencv_mat dst); +void opencv_mat_copy_to_packed_buffer(const opencv_mat src, void* dst_buffer, int width, int height, int channels); int opencv_mat_get_width(const opencv_mat mat); int opencv_mat_get_height(const opencv_mat 
mat); void* opencv_mat_get_data(const opencv_mat mat); diff --git a/ops.go b/ops.go index c6ce6d3b..c2376659 100644 --- a/ops.go +++ b/ops.go @@ -51,6 +51,12 @@ type ImageOptions struct { // DisableAnimatedOutput controls the encoder behavior when given a multi-frame input DisableAnimatedOutput bool + + // VideoFrameSampleIntervalMs controls the frame sampling rate for video inputs. + // For example, 100 means extract a frame every 100ms (10 FPS). + // If set to 0, only the first frame will be extracted (default behavior). + // This option only applies to video formats (MP4, MOV, WEBM). + VideoFrameSampleIntervalMs int } // ImageOps is a reusable object that can resize and encode images. @@ -259,11 +265,16 @@ func (o *ImageOps) encodeEmpty(e Encoder, opt map[int]int) ([]byte, error) { // skipToEnd advances the decoder to the final frame of an animation. // Returns io.EOF when the end is reached or an error if seeking fails. +// If the decoder doesn't support skipping, this is a no-op and returns io.EOF. func (o *ImageOps) skipToEnd(d Decoder) error { var err error for { err = d.SkipFrame() if err != nil { + // If skip is not supported, treat it as end-of-stream + if err == ErrSkipNotSupported { + return io.EOF + } return err } } @@ -395,6 +406,13 @@ func (o *ImageOps) transformCurrentFrame(d Decoder, opt *ImageOptions, inputHead // initializeTransform prepares for image transformation by reading the input header // and creating an appropriate encoder. Returns the header, encoder, and any error. 
func (o *ImageOps) initializeTransform(d Decoder, opt *ImageOptions, dst []byte) (*ImageHeader, Encoder, error) { + // Enable multi-frame video extraction if requested + if opt.VideoFrameSampleIntervalMs > 0 { + if vd, ok := d.(VideoDecoder); ok { + vd.SetFrameSampleInterval(opt.VideoFrameSampleIntervalMs) + } + } + inputHeader, err := d.Header() if err != nil { return nil, nil, err diff --git a/video_to_animated_test.go b/video_to_animated_test.go new file mode 100644 index 00000000..7db6a6b7 --- /dev/null +++ b/video_to_animated_test.go @@ -0,0 +1,483 @@ +package lilliput + +import ( + "os" + "testing" + "time" +) + +func TestVideoToAnimatedWebP(t *testing.T) { + t.Run("BasicVideoToAnimatedWebP", testBasicVideoToAnimatedWebP) + t.Run("VideoToAnimatedWebPWithCustomSampleInterval", testVideoToAnimatedWebPWithCustomSampleInterval) + t.Run("VideoToAnimatedWebPWithMaxFrames", testVideoToAnimatedWebPWithMaxFrames) + t.Run("VideoToAnimatedWebPWithZeroInterval", testVideoToAnimatedWebPWithZeroInterval) + t.Run("VideoToAnimatedWebPVerifyFrameCount", testVideoToAnimatedWebPVerifyFrameCount) + t.Run("VideoToAnimatedWebPWithResizing", testVideoToAnimatedWebPWithResizing) + t.Run("VideoToAnimatedWebPWithNonAlignedDimensions", testVideoToAnimatedWebPWithNonAlignedDimensions) +} + +func testBasicVideoToAnimatedWebP(t *testing.T) { + // Load a test video file + videoData, err := os.ReadFile("testdata/big_buck_bunny_480p_10s_web.mp4") + if err != nil { + t.Fatalf("Failed to read test video: %v", err) + } + + // Create decoder + decoder, err := newAVCodecDecoder(videoData) + if err != nil { + t.Fatalf("Failed to create decoder: %v", err) + } + defer decoder.Close() + + // Prepare transformation options + dstBuf := make([]byte, 50*1024*1024) // 50MB buffer for output + options := &ImageOptions{ + FileType: ".webp", + Width: 200, + Height: 200, + ResizeMethod: ImageOpsFit, + EncodeOptions: map[int]int{WebpQuality: 75}, + VideoFrameSampleIntervalMs: 1000, + EncodeTimeout: time.Second 
* 30, + } + + // Transform video to animated WebP + ops := NewImageOps(2000) + defer ops.Close() + + output, err := ops.Transform(decoder, options, dstBuf) + if err != nil { + t.Fatalf("Transform failed: %v", err) + } + + if len(output) == 0 { + t.Fatal("Transform returned empty output") + } + + // Verify output is a valid WebP + if len(output) < 12 { + t.Fatal("Output too small to be a valid WebP") + } + + // Check RIFF header + if string(output[0:4]) != "RIFF" { + t.Errorf("Expected RIFF header, got %s", string(output[0:4])) + } + + // Check WEBP signature + if string(output[8:12]) != "WEBP" { + t.Errorf("Expected WEBP signature, got %s", string(output[8:12])) + } + + // Write output for manual inspection if desired + if err := os.WriteFile("testdata/out/video_to_animated_basic.webp", output, 0644); err != nil { + t.Logf("Warning: Failed to write output file: %v", err) + } + + t.Logf("Successfully converted video to animated WebP, output size: %d bytes", len(output)) +} + +func testVideoToAnimatedWebPWithCustomSampleInterval(t *testing.T) { + videoData, err := os.ReadFile("testdata/big_buck_bunny_480p_10s_web.mp4") + if err != nil { + t.Fatalf("Failed to read test video: %v", err) + } + + testCases := []struct { + name string + interval int + maxSize int + }{ + {"500ms intervals", 500, 20}, + {"2 second intervals", 2000, 5}, + {"250ms intervals", 250, 40}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + decoder, err := newAVCodecDecoder(videoData) + if err != nil { + t.Fatalf("Failed to create decoder: %v", err) + } + defer decoder.Close() + + dstBuf := make([]byte, 50*1024*1024) + options := &ImageOptions{ + FileType: ".webp", + Width: 200, + Height: 200, + ResizeMethod: ImageOpsFit, + EncodeOptions: map[int]int{WebpQuality: 75}, + VideoFrameSampleIntervalMs: tc.interval, + EncodeTimeout: time.Second * 30, + } + + ops := NewImageOps(2000) + defer ops.Close() + + output, err := ops.Transform(decoder, options, dstBuf) + if err != nil 
{ + t.Fatalf("Transform failed: %v", err) + } + + if len(output) == 0 { + t.Fatal("Transform returned empty output") + } + + t.Logf("Converted video with %s sample interval, output size: %d bytes", tc.name, len(output)) + }) + } +} + +func testVideoToAnimatedWebPWithMaxFrames(t *testing.T) { + videoData, err := os.ReadFile("testdata/big_buck_bunny_480p_10s_web.mp4") + if err != nil { + t.Fatalf("Failed to read test video: %v", err) + } + + testCases := []struct { + name string + maxFrames int + sampleInterval int + expectedFrameCount int + }{ + {"MaxFrames 5", 5, 1000, 5}, + {"MaxFrames 3", 3, 500, 3}, + {"MaxFrames 10", 10, 1000, 10}, + {"MaxFrames 1 (single frame)", 1, 1000, 1}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + decoder, err := newAVCodecDecoder(videoData) + if err != nil { + t.Fatalf("Failed to create decoder: %v", err) + } + defer decoder.Close() + + dstBuf := make([]byte, 50*1024*1024) + options := &ImageOptions{ + FileType: ".webp", + Width: 200, + Height: 200, + ResizeMethod: ImageOpsFit, + EncodeOptions: map[int]int{WebpQuality: 75}, + VideoFrameSampleIntervalMs: tc.sampleInterval, + MaxEncodeFrames: tc.maxFrames, + EncodeTimeout: time.Second * 30, + } + + ops := NewImageOps(2000) + defer ops.Close() + + output, err := ops.Transform(decoder, options, dstBuf) + if err != nil { + t.Fatalf("Transform failed: %v", err) + } + + if len(output) == 0 { + t.Fatal("Transform returned empty output") + } + + // Decode the output to verify frame count + outputDecoder, err := newWebpDecoder(output) + if err != nil { + t.Fatalf("Failed to decode output WebP: %v", err) + } + defer outputDecoder.Close() + + header, err := outputDecoder.Header() + if err != nil { + t.Fatalf("Failed to get header: %v", err) + } + + actualFrameCount := header.NumFrames() + if actualFrameCount != tc.expectedFrameCount { + t.Errorf("Expected %d frames, got %d frames", tc.expectedFrameCount, actualFrameCount) + } + + t.Logf("Successfully limited to %d 
frames (expected %d), output size: %d bytes", + actualFrameCount, tc.expectedFrameCount, len(output)) + }) + } +} + +func testVideoToAnimatedWebPWithZeroInterval(t *testing.T) { + // With interval = 0, should only extract first frame (default behavior) + videoData, err := os.ReadFile("testdata/big_buck_bunny_480p_10s_web.mp4") + if err != nil { + t.Fatalf("Failed to read test video: %v", err) + } + + decoder, err := newAVCodecDecoder(videoData) + if err != nil { + t.Fatalf("Failed to create decoder: %v", err) + } + defer decoder.Close() + + dstBuf := make([]byte, 10*1024*1024) + options := &ImageOptions{ + FileType: ".webp", + Width: 200, + Height: 200, + ResizeMethod: ImageOpsFit, + EncodeOptions: map[int]int{WebpQuality: 75}, + VideoFrameSampleIntervalMs: 0, // No multi-frame extraction + EncodeTimeout: time.Second * 10, + } + + ops := NewImageOps(2000) + defer ops.Close() + + output, err := ops.Transform(decoder, options, dstBuf) + if err != nil { + t.Fatalf("Transform failed: %v", err) + } + + if len(output) == 0 { + t.Fatal("Transform returned empty output") + } + + // Decode to verify it's a single frame + outputDecoder, err := newWebpDecoder(output) + if err != nil { + t.Fatalf("Failed to decode output WebP: %v", err) + } + defer outputDecoder.Close() + + header, err := outputDecoder.Header() + if err != nil { + t.Fatalf("Failed to get header: %v", err) + } + + if header.IsAnimated() { + t.Errorf("Expected single frame, but output is animated with %d frames", header.NumFrames()) + } + + t.Logf("Successfully extracted single frame with zero interval, output size: %d bytes", len(output)) +} + +func testVideoToAnimatedWebPVerifyFrameCount(t *testing.T) { + // Test that the frame count matches expectations based on duration and sample interval + videoData, err := os.ReadFile("testdata/big_buck_bunny_480p_10s_web.mp4") + if err != nil { + t.Fatalf("Failed to read test video: %v", err) + } + + decoder, err := newAVCodecDecoder(videoData) + if err != nil { + 
t.Fatalf("Failed to create decoder: %v", err) + } + + duration := decoder.Duration() + decoder.Close() + + t.Logf("Video duration: %v", duration) + + // Re-create decoder for transformation + decoder, err = newAVCodecDecoder(videoData) + if err != nil { + t.Fatalf("Failed to create decoder: %v", err) + } + defer decoder.Close() + + sampleIntervalMs := 2000 + expectedFrames := int(float64(duration.Milliseconds())/float64(sampleIntervalMs)) + 1 + + dstBuf := make([]byte, 50*1024*1024) + options := &ImageOptions{ + FileType: ".webp", + Width: 200, + Height: 200, + ResizeMethod: ImageOpsFit, + EncodeOptions: map[int]int{WebpQuality: 75}, + VideoFrameSampleIntervalMs: sampleIntervalMs, + EncodeTimeout: time.Second * 30, + } + + ops := NewImageOps(2000) + defer ops.Close() + + output, err := ops.Transform(decoder, options, dstBuf) + if err != nil { + t.Fatalf("Transform failed: %v", err) + } + + // Verify frame count + outputDecoder, err := newWebpDecoder(output) + if err != nil { + t.Fatalf("Failed to decode output WebP: %v", err) + } + defer outputDecoder.Close() + + header, err := outputDecoder.Header() + if err != nil { + t.Fatalf("Failed to get header: %v", err) + } + + actualFrames := header.NumFrames() + + // Allow for small variance due to timing precision + if actualFrames < expectedFrames-1 || actualFrames > expectedFrames+1 { + t.Errorf("Expected approximately %d frames, got %d frames", expectedFrames, actualFrames) + } + + t.Logf("Video duration: %v, sample interval: %dms, expected ~%d frames, got %d frames", + duration, sampleIntervalMs, expectedFrames, actualFrames) +} + +func testVideoToAnimatedWebPWithResizing(t *testing.T) { + // Test different resize methods + videoData, err := os.ReadFile("testdata/big_buck_bunny_480p_10s_web.mp4") + if err != nil { + t.Fatalf("Failed to read test video: %v", err) + } + + testCases := []struct { + name string + width int + height int + resizeMethod ImageOpsSizeMethod + }{ + {"Fit 200x200", 200, 200, ImageOpsFit}, + 
{"Resize 300x200", 300, 200, ImageOpsResize}, + {"Fit 100x100", 100, 100, ImageOpsFit}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + decoder, err := newAVCodecDecoder(videoData) + if err != nil { + t.Fatalf("Failed to create decoder: %v", err) + } + defer decoder.Close() + + dstBuf := make([]byte, 50*1024*1024) + options := &ImageOptions{ + FileType: ".webp", + Width: tc.width, + Height: tc.height, + ResizeMethod: tc.resizeMethod, + EncodeOptions: map[int]int{WebpQuality: 75}, + VideoFrameSampleIntervalMs: 2000, + MaxEncodeFrames: 3, + EncodeTimeout: time.Second * 30, + } + + ops := NewImageOps(2000) + defer ops.Close() + + output, err := ops.Transform(decoder, options, dstBuf) + if err != nil { + t.Fatalf("Transform failed: %v", err) + } + + if len(output) == 0 { + t.Fatal("Transform returned empty output") + } + + // Verify dimensions + outputDecoder, err := newWebpDecoder(output) + if err != nil { + t.Fatalf("Failed to decode output WebP: %v", err) + } + defer outputDecoder.Close() + + header, err := outputDecoder.Header() + if err != nil { + t.Fatalf("Failed to get header: %v", err) + } + + // For Fit method, one dimension should match exactly + if tc.resizeMethod == ImageOpsFit { + if header.Width() != tc.width && header.Height() != tc.height { + t.Errorf("Expected at least one dimension to match request (%dx%d), got %dx%d", + tc.width, tc.height, header.Width(), header.Height()) + } + } + + t.Logf("%s: output dimensions %dx%d, size: %d bytes", + tc.name, header.Width(), header.Height(), len(output)) + }) + } +} + +func testVideoToAnimatedWebPWithNonAlignedDimensions(t *testing.T) { + // Test non-aligned dimensions to ensure stride handling works correctly + videoData, err := os.ReadFile("testdata/big_buck_bunny_480p_10s_web.mp4") + if err != nil { + t.Fatalf("Failed to read test video: %v", err) + } + + testCases := []struct { + name string + width int + height int + }{ + {"199x199 (not aligned to 8)", 199, 199}, + {"301x201 
(neither aligned)", 301, 201}, + {"150x100 (partially aligned)", 150, 100}, + {"333x333 (not aligned)", 333, 333}, + } + + for _, tc := range testCases { + t.Run(tc.name, func(t *testing.T) { + decoder, err := newAVCodecDecoder(videoData) + if err != nil { + t.Fatalf("Failed to create decoder: %v", err) + } + defer decoder.Close() + + dstBuf := make([]byte, 50*1024*1024) + options := &ImageOptions{ + FileType: ".webp", + Width: tc.width, + Height: tc.height, + ResizeMethod: ImageOpsResize, + EncodeOptions: map[int]int{WebpQuality: 75}, + VideoFrameSampleIntervalMs: 2000, + MaxEncodeFrames: 3, + EncodeTimeout: time.Second * 30, + } + + ops := NewImageOps(2000) + defer ops.Close() + + output, err := ops.Transform(decoder, options, dstBuf) + if err != nil { + t.Fatalf("Transform failed with %dx%d: %v", tc.width, tc.height, err) + } + + if len(output) == 0 { + t.Fatal("Transform returned empty output") + } + + // Verify output is valid WebP + outputDecoder, err := newWebpDecoder(output) + if err != nil { + t.Fatalf("Failed to decode output WebP: %v", err) + } + defer outputDecoder.Close() + + header, err := outputDecoder.Header() + if err != nil { + t.Fatalf("Failed to get header: %v", err) + } + + // Verify dimensions match + if header.Width() != tc.width || header.Height() != tc.height { + t.Errorf("Expected dimensions %dx%d, got %dx%d", + tc.width, tc.height, header.Width(), header.Height()) + } + + // Verify it's animated + if !header.IsAnimated() { + t.Errorf("Expected animated WebP, got static image") + } + + t.Logf("Successfully created %dx%d animated WebP, %d frames, size: %d bytes", + header.Width(), header.Height(), header.NumFrames(), len(output)) + }) + } +} diff --git a/webp.cpp b/webp.cpp index c36abe0b..0b563f78 100644 --- a/webp.cpp +++ b/webp.cpp @@ -23,8 +23,6 @@ struct webp_decoder_struct { int prev_frame_y_offset; WebPMuxAnimDispose prev_frame_dispose; WebPMuxAnimBlend prev_frame_blend; - uint8_t* decode_buffer; - size_t decode_buffer_size; int 
total_duration; }; @@ -126,10 +124,6 @@ webp_decoder webp_decoder_create(const opencv_mat buf) d->total_duration = 0; } - // Pre-allocate decode buffer - d->decode_buffer_size = d->width * d->height * 4; // 4 channels for RGBA - d->decode_buffer = new uint8_t[d->decode_buffer_size]; - return d; } @@ -322,9 +316,6 @@ bool webp_decoder_decode(const webp_decoder d, opencv_mat mat) auto cvMat = static_cast(mat); cvMat->create(features.height, features.width, webp_decoder_get_pixel_type(d)); - // Recalculate row size based on the new dimensions - int row_size = cvMat->cols * cvMat->elemSize(); - // Store frame properties for future use d->prev_frame_delay_time = frame.duration; d->prev_frame_x_offset = frame.x_offset; @@ -332,31 +323,28 @@ bool webp_decoder_decode(const webp_decoder d, opencv_mat mat) d->prev_frame_dispose = frame.dispose_method; d->prev_frame_blend = frame.blend_method; - // Decode the frame + // Decode the frame directly into the Mat using its actual stride + // This ensures proper handling of any row alignment/padding uint8_t* res = nullptr; switch (webp_decoder_get_pixel_type(d)) { case CV_8UC4: res = WebPDecodeBGRAInto(frame.bitstream.bytes, frame.bitstream.size, - d->decode_buffer, - d->decode_buffer_size, - row_size); + cvMat->data, + cvMat->rows * cvMat->step, + cvMat->step); break; case CV_8UC3: res = WebPDecodeBGRInto(frame.bitstream.bytes, frame.bitstream.size, - d->decode_buffer, - d->decode_buffer_size, - row_size); + cvMat->data, + cvMat->rows * cvMat->step, + cvMat->step); break; default: return false; } - if (res) { - memcpy(cvMat->data, d->decode_buffer, cvMat->total() * cvMat->elemSize()); - } - WebPDataClear(&frame.bitstream); return res != nullptr; } @@ -370,7 +358,6 @@ void webp_decoder_release(webp_decoder d) if (d) { if (d->mux) WebPMuxDelete(d->mux); - delete[] d->decode_buffer; delete d; } }