Skip to content

Commit 1eefcc7

Browse files
author
Gabriel Guralnick
committed
support for internvl_chat as well
1 parent 8de6410 commit 1eefcc7

File tree

1 file changed

+51
-2
lines changed

1 file changed

+51
-2
lines changed

src/llm_chat.ts

Lines changed: 51 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -940,6 +940,14 @@ export class LLMChatPipeline {
940940
const glbTokens = 12 * (12 + 1);
941941
return subTokens + 1 + glbTokens;
942942
}
943+
if (modelType === "internvl_chat") {
944+
const [cropH, cropW] = this.calculateCropShape(imageHeight, imageWidth);
945+
const maxTiles = this.config.model_config?.max_dynamic_patch ?? 12;
946+
// Include thumbnail only when max_dynamic_patch > 1 (with max_dynamic_patch=1,
947+
// thumbnail is identical to the single tile so compiled model skips it)
948+
const numTiles = cropH * cropW + (maxTiles > 1 ? 1 : 0);
949+
return numTiles * 256;
950+
}
943951
// For models with fixed embed size (e.g. Gemma3V)
944952
const mmTokens = this.config.model_config?.mm_tokens_per_image;
945953
if (mmTokens !== undefined) {
@@ -952,13 +960,45 @@ export class LLMChatPipeline {
952960
}
953961

954962
/**
955-
* Calculate resize dimensions for Phi3-V model.
963+
* Calculate resize dimensions based on model type.
956964
* Based on vlm_utils.cc CalculateResizeShape
957965
*/
958966
private calculateResizeShape(
959967
imageHeight: number,
960968
imageWidth: number,
961969
): [number, number] {
970+
const modelType = this.config.model_type;
971+
if (modelType === "internvl_chat") {
972+
const imageSize = 448;
973+
const maxTiles = this.config.model_config?.max_dynamic_patch ?? 12;
974+
const aspect = imageWidth / imageHeight;
975+
const area = imageWidth * imageHeight;
976+
977+
let bestI = 1,
978+
bestJ = 1;
979+
let bestDiff = Infinity;
980+
for (let n = 1; n <= maxTiles; n++) {
981+
for (let i = 1; i <= n; i++) {
982+
for (let j = 1; j <= n; j++) {
983+
if (i * j > maxTiles) continue;
984+
const targetAspect = i / j;
985+
const diff = Math.abs(targetAspect - aspect);
986+
if (diff < bestDiff) {
987+
bestDiff = diff;
988+
bestI = i;
989+
bestJ = j;
990+
} else if (diff === bestDiff) {
991+
if (area > 0.5 * imageSize * imageSize * i * j) {
992+
bestI = i;
993+
bestJ = j;
994+
}
995+
}
996+
}
997+
}
998+
}
999+
return [bestJ * imageSize, bestI * imageSize];
1000+
}
1001+
// phi3_v
9621002
const hdNum = 16;
9631003
const ratio = imageWidth / imageHeight;
9641004
let scale = 1;
@@ -972,13 +1012,22 @@ export class LLMChatPipeline {
9721012
}
9731013

9741014
/**
975-
* Calculate crop dimensions for Phi3-V model.
1015+
* Calculate crop dimensions based on model type.
9761016
* Based on vlm_utils.cc CalculateCropShape / CalculatePadShape
9771017
*/
9781018
private calculateCropShape(
9791019
imageHeight: number,
9801020
imageWidth: number,
9811021
): [number, number] {
1022+
const modelType = this.config.model_type;
1023+
if (modelType === "internvl_chat") {
1024+
const [resizeH, resizeW] = this.calculateResizeShape(
1025+
imageHeight,
1026+
imageWidth,
1027+
);
1028+
return [resizeH / 448, resizeW / 448];
1029+
}
1030+
// phi3_v
9821031
const [resizedHeight, resizedWidth] = this.calculateResizeShape(
9831032
imageHeight,
9841033
imageWidth,

0 commit comments

Comments
 (0)