-
Notifications
You must be signed in to change notification settings - Fork 13.7k
Support video in mtmd (base API) #16910
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from 11 commits
ae099ec
d5b832d
f09abc0
e377686
2c1d02a
816b92b
efefc2a
32a45c4
5ee744f
6e8c9f6
1e9c563
113cfc2
ef68f2a
358be38
9396d6e
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -203,6 +203,7 @@ struct clip_hparams { | |
| // legacy | ||
| bool has_llava_projector = false; | ||
| int minicpmv_version = 0; | ||
| int minicpmv_max_slice_nums = 9; | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Search for |
||
| int32_t minicpmv_query_num = 0; // MiniCPM-V query number | ||
| }; | ||
|
|
||
|
|
@@ -3639,16 +3640,67 @@ struct llava_uhd { | |
| const bool has_slices = original_size.width > slice_size || original_size.height > slice_size; | ||
| const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty(); | ||
|
|
||
| if (!has_slices) { | ||
| // skip slicing logic | ||
| res.overview_size = clip_image_size{slice_size, slice_size}; | ||
| res.refined_size = clip_image_size{0, 0}; | ||
| res.grid_size = clip_image_size{0, 0}; | ||
| if (clip_is_minicpmv(ctx)) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What's the change here compared to the initial version? IMO I think this is more like a duplicated logic, see |
||
| auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); | ||
| res.overview_size = best_size; | ||
|
|
||
| { | ||
| const int max_slice_nums = ctx->model.hparams.minicpmv_max_slice_nums; | ||
| const float log_ratio = log((float)original_width / original_height); | ||
| const float ratio = (float)original_width * original_height / (slice_size * slice_size); | ||
| const int multiple = fmin(ceil(ratio), max_slice_nums); | ||
|
|
||
| auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); | ||
| auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); | ||
| res.grid_size = best_grid; | ||
| res.refined_size = refine_size; | ||
|
|
||
| LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", | ||
| __func__, original_width, original_height, | ||
| res.overview_size.width, res.overview_size.height, | ||
| res.refined_size.width, res.refined_size.height, | ||
| res.grid_size.width, res.grid_size.height); | ||
|
|
||
| if (!has_slices || max_slice_nums == 0) { | ||
| return res; | ||
| } | ||
|
|
||
| int width = refine_size.width; | ||
| int height = refine_size.height; | ||
| int grid_x = int(width / best_grid.width); | ||
| int grid_y = int(height / best_grid.height); | ||
| for (int patches_y = 0, ic = 0; | ||
| patches_y < refine_size.height && ic < best_grid.height; | ||
| patches_y += grid_y, ic += 1) { | ||
| for (int patches_x = 0, jc = 0; | ||
| patches_x < refine_size.width && jc < best_grid.width; | ||
| patches_x += grid_x, jc += 1) { | ||
| slice_coordinates slice; | ||
| slice.x = patches_x; | ||
| slice.y = patches_y; | ||
| slice.size.width = grid_x; | ||
| slice.size.height = grid_y; | ||
| res.slices.push_back(slice); | ||
| LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", | ||
| __func__, (int)res.slices.size() - 1, | ||
| slice.x, slice.y, slice.size.width, slice.size.height); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return res; | ||
| } | ||
| else { | ||
| if (!has_slices) { | ||
| // skip slicing logic | ||
| res.overview_size = clip_image_size{slice_size, slice_size}; | ||
| res.refined_size = clip_image_size{0, 0}; | ||
| res.grid_size = clip_image_size{0, 0}; | ||
|
|
||
| if (has_pinpoints) { | ||
| return res; | ||
| } | ||
|
|
||
| if (has_pinpoints) { | ||
| // has pinpoints, use them to calculate the grid size (e.g. llava-1.6) | ||
| auto refine_size = llava_uhd::select_best_resolution( | ||
| original_size, | ||
|
|
@@ -3684,53 +3736,7 @@ struct llava_uhd { | |
|
|
||
| return res; | ||
| } | ||
|
|
||
| // no pinpoints, dynamically calculate the grid size (e.g. minicpmv) | ||
|
|
||
| auto best_size = get_best_resize(original_size, slice_size, patch_size, !has_slices); | ||
| res.overview_size = best_size; | ||
|
|
||
| { | ||
| const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it | ||
| const float log_ratio = log((float)original_width / original_height); | ||
| const float ratio = (float)original_width * original_height / (slice_size * slice_size); | ||
| const int multiple = fmin(ceil(ratio), max_slice_nums); | ||
|
|
||
| auto best_grid = get_best_grid(max_slice_nums, multiple, log_ratio); | ||
| auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true); | ||
| res.grid_size = best_grid; | ||
| res.refined_size = refine_size; | ||
|
|
||
| LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n", | ||
| __func__, original_width, original_height, | ||
| res.overview_size.width, res.overview_size.height, | ||
| res.refined_size.width, res.refined_size.height, | ||
| res.grid_size.width, res.grid_size.height); | ||
|
|
||
| int width = refine_size.width; | ||
| int height = refine_size.height; | ||
| int grid_x = int(width / best_grid.width); | ||
| int grid_y = int(height / best_grid.height); | ||
| for (int patches_y = 0, ic = 0; | ||
| patches_y < refine_size.height && ic < best_grid.height; | ||
| patches_y += grid_y, ic += 1) { | ||
| for (int patches_x = 0, jc = 0; | ||
| patches_x < refine_size.width && jc < best_grid.width; | ||
| patches_x += grid_x, jc += 1) { | ||
| slice_coordinates slice; | ||
| slice.x = patches_x; | ||
| slice.y = patches_y; | ||
| slice.size.width = grid_x; | ||
| slice.size.height = grid_y; | ||
| res.slices.push_back(slice); | ||
| LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n", | ||
| __func__, (int)res.slices.size() - 1, | ||
| slice.x, slice.y, slice.size.width, slice.size.height); | ||
| } | ||
| } | ||
| } | ||
|
|
||
| return res; | ||
| } | ||
|
|
||
| static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) { | ||
|
|
@@ -4836,6 +4842,12 @@ bool clip_has_whisper_encoder(const struct clip_ctx * ctx) { | |
| || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL; | ||
| } | ||
|
|
||
// Overrides the MiniCPM-V slice limit stored in ctx->model.hparams.minicpmv_max_slice_nums.
//   ctx: context to update; a null pointer makes the call a no-op.
//   n:   requested limit; negative values are stored as 0.
// The slicing code shown earlier in this diff reads this field and returns before
// generating any slice coordinates when it is 0.
| void clip_set_minicpmv_max_slice_nums(struct clip_ctx * ctx, int n) { | |
| if (!ctx) return; | |
| if (n < 0) n = 0; | |
| ctx->model.hparams.minicpmv_max_slice_nums = n; | |
| } | |
|
|
||
| bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) { | ||
| clip_image_f32 clip_img; | ||
| clip_img.buf.resize(h * w * 3); | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this should be added as an env var instead. See MTMD_DEBUG_GRAPH as an example.
Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This issue has already been fixed in commit ef68f2a8bbf60e.