@@ -940,6 +940,14 @@ export class LLMChatPipeline {
940940 const glbTokens = 12 * ( 12 + 1 ) ;
941941 return subTokens + 1 + glbTokens ;
942942 }
943+ if ( modelType === "internvl_chat" ) {
944+ const [ cropH , cropW ] = this . calculateCropShape ( imageHeight , imageWidth ) ;
945+ const maxTiles = this . config . model_config ?. max_dynamic_patch ?? 12 ;
946+ // Include the thumbnail tile only when max_dynamic_patch > 1 (with max_dynamic_patch=1,
947+ // the thumbnail is identical to the single tile, so the compiled model skips it)
948+ const numTiles = cropH * cropW + ( maxTiles > 1 ? 1 : 0 ) ;
949+ return numTiles * 256 ;
950+ }
943951 // For models with fixed embed size (e.g. Gemma3V)
944952 const mmTokens = this . config . model_config ?. mm_tokens_per_image ;
945953 if ( mmTokens !== undefined ) {
@@ -952,13 +960,45 @@ export class LLMChatPipeline {
952960 }
953961
954962 /**
955- * Calculate resize dimensions for Phi3-V model.
963+ * Calculate resize dimensions based on the model type (InternVL or Phi3-V).
956964 * Based on vlm_utils.cc CalculateResizeShape
957965 */
958966 private calculateResizeShape (
959967 imageHeight : number ,
960968 imageWidth : number ,
961969 ) : [ number , number ] {
970+ const modelType = this . config . model_type ;
971+ if ( modelType === "internvl_chat" ) {
972+ const imageSize = 448 ;
973+ const maxTiles = this . config . model_config ?. max_dynamic_patch ?? 12 ;
974+ const aspect = imageWidth / imageHeight ;
975+ const area = imageWidth * imageHeight ;
976+
977+ let bestI = 1 ,
978+ bestJ = 1 ;
979+ let bestDiff = Infinity ;
980+ for ( let n = 1 ; n <= maxTiles ; n ++ ) {
981+ for ( let i = 1 ; i <= n ; i ++ ) {
982+ for ( let j = 1 ; j <= n ; j ++ ) {
983+ if ( i * j > maxTiles ) continue ;
984+ const targetAspect = i / j ;
985+ const diff = Math . abs ( targetAspect - aspect ) ;
986+ if ( diff < bestDiff ) {
987+ bestDiff = diff ;
988+ bestI = i ;
989+ bestJ = j ;
990+ } else if ( diff === bestDiff ) {
991+ if ( area > 0.5 * imageSize * imageSize * i * j ) {
992+ bestI = i ;
993+ bestJ = j ;
994+ }
995+ }
996+ }
997+ }
998+ }
999+ return [ bestJ * imageSize , bestI * imageSize ] ;
1000+ }
1001+ // phi3_v
9621002 const hdNum = 16 ;
9631003 const ratio = imageWidth / imageHeight ;
9641004 let scale = 1 ;
@@ -972,13 +1012,22 @@ export class LLMChatPipeline {
9721012 }
9731013
9741014 /**
975- * Calculate crop dimensions for Phi3-V model.
1015+ * Calculate crop dimensions based on the model type (InternVL or Phi3-V).
9761016 * Based on vlm_utils.cc CalculateCropShape / CalculatePadShape
9771017 */
9781018 private calculateCropShape (
9791019 imageHeight : number ,
9801020 imageWidth : number ,
9811021 ) : [ number , number ] {
1022+ const modelType = this . config . model_type ;
1023+ if ( modelType === "internvl_chat" ) {
1024+ const [ resizeH , resizeW ] = this . calculateResizeShape (
1025+ imageHeight ,
1026+ imageWidth ,
1027+ ) ;
1028+ return [ resizeH / 448 , resizeW / 448 ] ;
1029+ }
1030+ // phi3_v
9821031 const [ resizedHeight , resizedWidth ] = this . calculateResizeShape (
9831032 imageHeight ,
9841033 imageWidth ,
0 commit comments