Skip to content

Commit e47bd3d

Browse files
committed
update video input & thought detection for kimi-vl
1 parent d47a838 commit e47bd3d

File tree

5 files changed

+36
-18
lines changed

5 files changed

+36
-18
lines changed

models/kimi.cpp

Lines changed: 14 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -589,6 +589,7 @@ namespace vl
589589

590590
int video_max_frames = 20;
591591
bool arbitrary_resolution = false;
592+
double fps = 1.0;
592593
};
593594

594595
void ChatHistoryEncoder::append_ai(int round_idx, const std::string &ai, std::vector<int> &ids) const
@@ -686,6 +687,7 @@ namespace vl
686687
Tokenizer *tok = dynamic_cast<Tokenizer *>(tokenizer);
687688
tok->video_max_frames = utils::get_opt(args, "video_max_frames", tok->video_max_frames);
688689
tok->arbitrary_resolution = utils::get_opt(args, "arbitrary_resolution", false);
690+
tok->fps = utils::get_opt(args, "fps", tok->fps);
689691
}
690692

691693
void before_generate(const GenerationConfig &gen_config) override
@@ -709,6 +711,12 @@ namespace vl
709711

710712
std::vector<std::unique_ptr<vision::VideoLoader>> videos;
711713

714+
std::unique_ptr<vision::Resize> resize;
715+
std::unique_ptr<vision::PreMaxImageSize> max_size;
716+
717+
if (!tok->arbitrary_resolution)
718+
resize.reset(new vision::Resize(896, 896));
719+
712720
// expand video into images
713721
std::vector<ContentPiece> pieces;
714722
for (auto &piece : user.pieces)
@@ -719,16 +727,18 @@ namespace vl
719727
continue;
720728
}
721729

722-
// video is just like a collection of images.
723-
// But, it's still not clear on fps.
724-
// https://github.com/MoonshotAI/Kimi-VL/issues/24#issuecomment-2804163270
725-
auto video = new vision::VideoLoader(piece.content.c_str(), 1.0f, tok->video_max_frames);
730+
// ref: https://huggingface.co/blog/moonshotai/kimi-vl-a3b-thinking-2506
731+
auto video = new vision::VideoLoader(piece.content.c_str(), (float)tok->fps, tok->video_max_frames);
726732
videos.emplace_back(video);
727733
if (video->frames.size() < 1)
728734
continue;
729735

736+
if (max_size.get() == nullptr)
737+
max_size.reset(new vision::PreMaxImageSize(448, 448));
738+
730739
for (size_t i = 0; i < video->frames.size(); i++)
731740
{
741+
pieces.emplace_back(utils::sec2hms(i / tok->fps, true));
732742
pieces.emplace_back(video->frames[i], ContentPiece::Type::Image);
733743
}
734744
}
@@ -752,10 +762,6 @@ namespace vl
752762
vision::MaxGridHeight param3(512);
753763
vision::MaxGridWidth param4(512);
754764

755-
std::unique_ptr<vision::Resize> resize;
756-
if (!tok->arbitrary_resolution)
757-
resize.reset(new vision::Resize(896, 896));
758-
759765
vision::image_load(piece.content.c_str(), pixels, w, h, patch_size, vision::PaddingMode::Black);
760766

761767
std::vector<float> scaled;

src/basics.h

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -76,8 +76,8 @@ namespace utils
7676

7777
std::string num2words(int value);
7878

79-
std::string sec2hms(float seconds, bool show_ms = false);
80-
std::string sec2ms(float seconds, bool show_ms = false);
79+
std::string sec2hms(double seconds, bool hour_2digits = false, bool show_ms = false);
80+
std::string sec2ms(double seconds, bool show_ms = false);
8181

8282
// create a unique temp file name (full path)
8383
std::string tmpname(void);

src/chat.cpp

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -457,8 +457,10 @@ namespace chatllm
457457

458458
void ThoughtChunkInterceptor::init(std::vector<std::pair<std::string, std::string>> tags)
459459
{
460-
this->tags = tags;
460+
this->tags = tags;
461461
active = tags.size() > 0;
462+
if (active)
463+
std::sort(this->tags.begin(), this->tags.end(), [](auto& a, auto& b) { return a.first.size() < b.first.size(); });
462464
}
463465

464466
void ThoughtChunkInterceptor::put_chunk(bool first, const std::string &chunk)

src/main.cpp

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -112,11 +112,11 @@ static chatllm::Pipeline::ExtendingMethod parse_extending_method(const std::stri
112112
return chatllm::Pipeline::ExtendingMethod::None;
113113
}
114114

115-
// sorted by length!
116115
const std::vector<std::pair<std::string, std::string>> THOUGHT_TAGS = {
117116
{"<think>", "</think>"},
117+
{"◁think▷", "◁/think▷"},
118118
{"<thought>", "</thought>"},
119-
{"<reasoning>", "</reasoning>"}
119+
{"<reasoning>", "</reasoning>"},
120120
};
121121

122122
static std::string show_default_thought_tags(void)

src/vectorstore.cpp

Lines changed: 15 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -542,7 +542,7 @@ namespace utils
542542
return result;
543543
}
544544

545-
std::string sec2hms(float seconds, bool show_ms)
545+
std::string sec2hms(double seconds, bool hour_2digits, bool show_ms)
546546
{
547547
int sec = (int)seconds;
548548
int ms = (int)((seconds - sec) * 1000000);
@@ -551,14 +551,24 @@ namespace utils
551551
sec %= 60;
552552
min %= 60;
553553
char s[100];
554-
if (show_ms)
555-
sprintf(s, "%d:%02d:%02d.%06d", hh, min, sec, ms);
554+
if (hour_2digits)
555+
{
556+
if (show_ms)
557+
sprintf(s, "%02d:%02d:%02d.%06d", hh, min, sec, ms);
558+
else
559+
sprintf(s, "%02d:%02d:%02d", hh, min, sec);
560+
}
556561
else
557-
sprintf(s, "%d:%02d:%02d", hh, min, sec);
562+
{
563+
if (show_ms)
564+
sprintf(s, "%d:%02d:%02d.%06d", hh, min, sec, ms);
565+
else
566+
sprintf(s, "%d:%02d:%02d", hh, min, sec);
567+
}
558568
return s;
559569
}
560570

561-
std::string sec2ms(float seconds, bool show_ms)
571+
std::string sec2ms(double seconds, bool show_ms)
562572
{
563573
int sec = (int)seconds;
564574
int ms = (int)((seconds - sec) * 1000000);

0 commit comments

Comments
 (0)