update video input & thought detection for kimi-vl

foldl · foldl · commit e47bd3db2bab · 2025-06-24T14:48:24.000+08:00
diff --git a/models/kimi.cpp b/models/kimi.cpp
@@ -589,6 +589,7 @@ namespace vl
 
         int video_max_frames = 20;
         bool arbitrary_resolution = false;
+        double fps = 1.0;
     };
 
     void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
@@ -686,6 +687,7 @@ namespace vl
             Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
             tok->video_max_frames       = utils::get_opt(args, "video_max_frames", tok->video_max_frames);
             tok->arbitrary_resolution   = utils::get_opt(args, "arbitrary_resolution", false);
+            tok->fps                    = utils::get_opt(args, "fps", tok->fps);
         }
 
         void before_generate(const GenerationConfig &gen_config) override
@@ -709,6 +711,12 @@ namespace vl
 
         std::vector<std::unique_ptr<vision::VideoLoader>> videos;
 
+        std::unique_ptr<vision::Resize> resize;
+        std::unique_ptr<vision::PreMaxImageSize> max_size;
+
+        if (!tok->arbitrary_resolution)
+            resize.reset(new vision::Resize(896, 896));
+
         // expand video into images
         std::vector<ContentPiece> pieces;
         for (auto &piece : user.pieces)
@@ -719,16 +727,18 @@ namespace vl
                 continue;
             }
 
-            // video is just like a collection of images.
-            // But, it's still not clear on fps.
-            // https://github.com/MoonshotAI/Kimi-VL/issues/24#issuecomment-2804163270
-            auto video = new vision::VideoLoader(piece.content.c_str(), 1.0f, tok->video_max_frames);
+            // ref: https://huggingface.co/blog/moonshotai/kimi-vl-a3b-thinking-2506
+            auto video = new vision::VideoLoader(piece.content.c_str(), (float)tok->fps, tok->video_max_frames);
             videos.emplace_back(video);
             if (video->frames.size() < 1)
                 continue;
 
+            if (max_size.get() == nullptr)
+                max_size.reset(new vision::PreMaxImageSize(448, 448));
+
             for (size_t i = 0; i < video->frames.size(); i++)
             {
+                pieces.emplace_back(utils::sec2hms(i / tok->fps, true));
                 pieces.emplace_back(video->frames[i], ContentPiece::Type::Image);
             }
         }
@@ -752,10 +762,6 @@ namespace vl
                 vision::MaxGridHeight   param3(512);
                 vision::MaxGridWidth    param4(512);
 
-                std::unique_ptr<vision::Resize> resize;
-                if (!tok->arbitrary_resolution)
-                    resize.reset(new vision::Resize(896, 896));
-
                 vision::image_load(piece.content.c_str(), pixels, w, h, patch_size, vision::PaddingMode::Black);
 
                 std::vector<float> scaled;
diff --git a/src/basics.h b/src/basics.h
@@ -76,8 +76,8 @@ namespace utils
 
     std::string num2words(int value);
 
-    std::string sec2hms(float seconds, bool show_ms = false);
-    std::string sec2ms(float seconds, bool show_ms = false);
+    std::string sec2hms(double seconds, bool hour_2digits = false, bool show_ms = false);
+    std::string sec2ms(double seconds, bool show_ms = false);
 
     // create a unique temp file name (full path)
     std::string tmpname(void);
diff --git a/src/chat.cpp b/src/chat.cpp
@@ -457,8 +457,10 @@ namespace chatllm
 
     void ThoughtChunkInterceptor::init(std::vector<std::pair<std::string, std::string>> tags)
     {
-        this->tags = tags;
+        this->tags = tags;        
         active = tags.size() > 0;
+        if (active)
+            std::sort(this->tags.begin(), this->tags.end(), [](auto& a, auto& b) { return a.first.size() < b.first.size(); });
     }
 
     void ThoughtChunkInterceptor::put_chunk(bool first, const std::string &chunk)
diff --git a/src/main.cpp b/src/main.cpp
@@ -112,11 +112,11 @@ static chatllm::Pipeline::ExtendingMethod parse_extending_method(const std::stri
         return chatllm::Pipeline::ExtendingMethod::None;
 }
 
-// sorted by length!
 const std::vector<std::pair<std::string, std::string>> THOUGHT_TAGS = {
     {"<think>",         "</think>"},
+    {"◁think▷",         "◁/think▷"},
     {"<thought>",       "</thought>"},
-    {"<reasoning>",     "</reasoning>"}
+    {"<reasoning>",     "</reasoning>"},
 };
 
 static std::string show_default_thought_tags(void)
diff --git a/src/vectorstore.cpp b/src/vectorstore.cpp
@@ -542,7 +542,7 @@ namespace utils
         return result;
     }
 
-    std::string sec2hms(float seconds, bool show_ms)
+    std::string sec2hms(double seconds, bool hour_2digits, bool show_ms)
     {
         int sec = (int)seconds;
         int ms  = (int)((seconds - sec) * 1000000);
@@ -551,14 +551,24 @@ namespace utils
         sec %= 60;
         min %= 60;
         char s[100];
-        if (show_ms)
-            sprintf(s, "%d:%02d:%02d.%06d", hh, min, sec, ms);
+        if (hour_2digits)
+        {
+            if (show_ms)
+                sprintf(s, "%02d:%02d:%02d.%06d", hh, min, sec, ms);
+            else
+                sprintf(s, "%02d:%02d:%02d", hh, min, sec);
+        }
         else
-            sprintf(s, "%d:%02d:%02d", hh, min, sec);
+        {
+            if (show_ms)
+                sprintf(s, "%d:%02d:%02d.%06d", hh, min, sec, ms);
+            else
+                sprintf(s, "%d:%02d:%02d", hh, min, sec);
+        }
         return s;
     }
 
-    std::string sec2ms(float seconds, bool show_ms)
+    std::string sec2ms(double seconds, bool show_ms)
     {
         int sec = (int)seconds;
         int ms  = (int)((seconds - sec) * 1000000);

Original file line number	Diff line number	Diff line change
`@@ -457,8 +457,10 @@ namespace chatllm`
`457`	`457`
`458`	`458`	`void ThoughtChunkInterceptor::init(std::vector<std::pair<std::string, std::string>> tags)`
`459`	`459`	`{`
`460`		`- this->tags = tags;`
	`460`	`+ this->tags = tags;`
`461`	`461`	`active = tags.size() > 0;`
	`462`	`+ if (active)`
	`463`	`+ std::sort(this->tags.begin(), this->tags.end(), [](auto& a, auto& b) { return a.first.size() < b.first.size(); });`
`462`	`464`	`}`
`463`	`465`
`464`	`466`	`void ThoughtChunkInterceptor::put_chunk(bool first, const std::string &chunk)`