@@ -588,6 +588,7 @@ namespace vl
588588 int media_pad_token_id;
589589
590590 int video_max_frames = 20 ;
591+ bool arbitrary_resolution = false ;
591592 };
592593
593594 void ChatHistoryEncoder::append_ai (int round_idx, const std::string &ai, std::vector<int > &ids) const
@@ -683,7 +684,8 @@ namespace vl
683684 void set_additional_args (const std::map<std::string, std::string> &args) override
684685 {
685686 Tokenizer *tok = dynamic_cast <Tokenizer *>(tokenizer);
686- tok->video_max_frames = utils::get_opt (args, " video_max_frames" , tok->video_max_frames );
687+ tok->video_max_frames = utils::get_opt (args, " video_max_frames" , tok->video_max_frames );
688+ tok->arbitrary_resolution = utils::get_opt (args, " arbitrary_resolution" , false );
687689 }
688690
689691 void before_generate (const GenerationConfig &gen_config) override
@@ -750,6 +752,10 @@ namespace vl
750752 vision::MaxGridHeight param3 (512 );
751753 vision::MaxGridWidth param4 (512 );
752754
755+ std::unique_ptr<vision::Resize> resize;
756+ if (!tok->arbitrary_resolution )
757+ resize.reset (new vision::Resize (896 , 896 ));
758+
753759 vision::image_load (piece.content .c_str (), pixels, w, h, patch_size, vision::PaddingMode::Black);
754760
755761 std::vector<float > scaled;
0 commit comments