@@ -589,6 +589,7 @@ namespace vl
589589
590590 int video_max_frames = 20 ;
591591 bool arbitrary_resolution = false ;
592+ double fps = 1.0 ;
592593 };
593594
594595 void ChatHistoryEncoder::append_ai (int round_idx, const std::string &ai, std::vector<int > &ids) const
@@ -686,6 +687,7 @@ namespace vl
686687 Tokenizer *tok = dynamic_cast <Tokenizer *>(tokenizer);
687688 tok->video_max_frames = utils::get_opt (args, " video_max_frames" , tok->video_max_frames );
688689 tok->arbitrary_resolution = utils::get_opt (args, " arbitrary_resolution" , false );
690+ tok->fps = utils::get_opt (args, " fps" , tok->fps );
689691 }
690692
691693 void before_generate (const GenerationConfig &gen_config) override
@@ -709,6 +711,12 @@ namespace vl
709711
710712 std::vector<std::unique_ptr<vision::VideoLoader>> videos;
711713
714+ std::unique_ptr<vision::Resize> resize;
715+ std::unique_ptr<vision::PreMaxImageSize> max_size;
716+
717+ if (!tok->arbitrary_resolution )
718+ resize.reset (new vision::Resize (896 , 896 ));
719+
712720 // expand video into images
713721 std::vector<ContentPiece> pieces;
714722 for (auto &piece : user.pieces )
@@ -719,16 +727,18 @@ namespace vl
719727 continue ;
720728 }
721729
722- // video is just like a collection of images.
723- // But, it's still not clear on fps.
724- // https://github.com/MoonshotAI/Kimi-VL/issues/24#issuecomment-2804163270
725- auto video = new vision::VideoLoader (piece.content .c_str (), 1 .0f , tok->video_max_frames );
730+ // ref: https://huggingface.co/blog/moonshotai/kimi-vl-a3b-thinking-2506
731+ auto video = new vision::VideoLoader (piece.content .c_str (), (float )tok->fps , tok->video_max_frames );
726732 videos.emplace_back (video);
727733 if (video->frames .size () < 1 )
728734 continue ;
729735
736+ if (max_size.get () == nullptr )
737+ max_size.reset (new vision::PreMaxImageSize (448 , 448 ));
738+
730739 for (size_t i = 0 ; i < video->frames .size (); i++)
731740 {
741+ pieces.emplace_back (utils::sec2hms (i / tok->fps , true ));
732742 pieces.emplace_back (video->frames [i], ContentPiece::Type::Image);
733743 }
734744 }
@@ -752,10 +762,6 @@ namespace vl
752762 vision::MaxGridHeight param3 (512 );
753763 vision::MaxGridWidth param4 (512 );
754764
755- std::unique_ptr<vision::Resize> resize;
756- if (!tok->arbitrary_resolution )
757- resize.reset (new vision::Resize (896 , 896 ));
758-
759765 vision::image_load (piece.content .c_str (), pixels, w, h, patch_size, vision::PaddingMode::Black);
760766
761767 std::vector<float > scaled;
0 commit comments