@@ -40,6 +40,7 @@
 #include <map>
 #include <regex>
 #include <stdexcept>
+#include <unordered_set>
 #include <vector>
 #include <sstream>
 #include <cinttypes>
@@ -447,7 +448,7 @@ struct clip_hparams {
 
     std::vector<int32_t> image_grid_pinpoints;
     int32_t image_crop_resolution;
-    std::vector<int32_t> vision_feature_layer;
+    std::unordered_set<int32_t> vision_feature_layer;
 };
 
 struct clip_layer {
@@ -756,18 +757,16 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     std::vector<struct ggml_tensor *> embedding_stack;
+    const auto & vision_feature_layer = hparams.vision_feature_layer;
 
     // loop over layers
     for (int il = 0; il < ctx->max_feature_layer; il++) {
         struct ggml_tensor * cur = embeddings; // embeddings = residual, cur = hidden_states
 
         // If this is an embedding feature layer, save the output.
         // NOTE: 0 index here refers to the input to the encoder.
-        for (size_t vl_idx = 0; vl_idx < hparams.vision_feature_layer.size(); vl_idx++) {
-            if (il == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
-                embedding_stack.push_back(embeddings);
-                break;
-            }
+        if (vision_feature_layer.find(il) != vision_feature_layer.end()) {
+            embedding_stack.push_back(embeddings);
         }
 
         //const size_t nb_q_w = model.layers[il].q_w->nb[0];
@@ -868,11 +867,8 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     }
 
     // final layer is a vision feature layer
-    for (size_t vl_idx = 0; vl_idx < hparams.vision_feature_layer.size(); vl_idx++) {
-        if (n_layer == ctx->vision_model.hparams.vision_feature_layer[vl_idx]) {
-            embedding_stack.push_back(embeddings);
-            break;
-        }
+    if (vision_feature_layer.find(n_layer) != vision_feature_layer.end()) {
+        embedding_stack.push_back(embeddings);
     }
 
     // If feature layers are explicitly set, stack them (if we have multiple)
@@ -1486,7 +1482,7 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         const int32_t * vision_feature_layer = (const int32_t *)gguf_get_arr_data(ctx, idx);
 
         for (int i = 0; i < n; ++i) {
-            hparams.vision_feature_layer.push_back(vision_feature_layer[i]);
+            hparams.vision_feature_layer.insert(vision_feature_layer[i]);
         }
     } catch (std::runtime_error & /*e*/) { }
 
@@ -1530,13 +1526,13 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
         LOG_INF("v_image_mean %f %f %f\n", new_clip->image_mean[0], new_clip->image_mean[1], new_clip->image_mean[2]);
         LOG_INF("v_image_std %f %f %f\n", new_clip->image_std[0], new_clip->image_std[1], new_clip->image_std[2]);
         LOG_INF("v_image_grid_pinpoints: ");
-        for (size_t i = 0; i < hparams.image_grid_pinpoints.size(); ++i) {
-            LOG_INF("%d ", hparams.image_grid_pinpoints[i]);
+        for (const auto & pp : hparams.image_grid_pinpoints) {
+            LOG_INF("%d ", pp);
         }
         LOG_INF("\n");
         LOG_INF("v_vision_feature_layer: ");
-        for (size_t i = 0; i < hparams.vision_feature_layer.size(); i++) {
-            LOG_INF("%d ", hparams.vision_feature_layer[i]);
+        for (const auto & feature_layer : hparams.vision_feature_layer) {
+            LOG_INF("%d ", feature_layer);
         }
         LOG_INF("\n");
         LOG_INF("v_mm_patch_merge_type: %s\n", hparams.mm_patch_merge_type);
@@ -2997,9 +2993,9 @@ int get_deepest_feature_layer(const struct clip_ctx * ctx) {
     }
 
     // If we set explicit vision feature layers, only go up to the deepest one
-    for (size_t i = 0; i < hparams.vision_feature_layer.size(); i++) {
-        if (hparams.vision_feature_layer[i] > deepest_feature_layer) {
-            deepest_feature_layer = hparams.vision_feature_layer[i];
+    for (const auto & feature_layer : hparams.vision_feature_layer) {
+        if (feature_layer > deepest_feature_layer) {
+            deepest_feature_layer = feature_layer;
         }
     }
     return deepest_feature_layer < 0 ? n_layer : deepest_feature_layer;
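
The heart of the change is swapping a per-layer linear scan over a std::vector for an average O(1) membership test on a std::unordered_set. Below is a minimal standalone sketch of that pattern, not the clip.cpp code itself: the collect_feature_layers helper and the plain int layer ids are hypothetical stand-ins for the graph-building loop and its ggml_tensor* stack.

#include <cstdio>
#include <unordered_set>
#include <vector>

// Hypothetical helper mirroring the commit's pattern: walk the encoder
// layers in order and keep the output of every configured feature layer.
static std::vector<int> collect_feature_layers(int n_layer, const std::unordered_set<int> & feature_layers) {
    std::vector<int> embedding_stack; // stand-in for the ggml_tensor* stack
    for (int il = 0; il < n_layer; il++) {
        // set lookup replaces the old inner for-loop over a vector
        if (feature_layers.find(il) != feature_layers.end()) {
            embedding_stack.push_back(il);
        }
    }
    return embedding_stack;
}

int main() {
    // e.g. a LLaVA-style config that stacks several encoder layers
    const std::unordered_set<int> feature_layers = {3, 7, 15, 26};
    for (int il : collect_feature_layers(27, feature_layers)) {
        printf("feature layer %d\n", il);
    }
    return 0;
}

Note that the graph loop stays ordered by iterating layer indices and querying the set; iterating the unordered_set directly visits elements in unspecified order, which is fine for the v_vision_feature_layer log loop but would not be for stacking embeddings.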