@@ -20,6 +20,7 @@ import {
2020 Processor ,
2121 Florence2Processor ,
2222 Idefics3Processor ,
23+ PaliGemmaProcessor ,
2324
2425 // Models
2526 LlamaForCausalLM ,
@@ -54,6 +55,7 @@ import {
5455 VisionEncoderDecoderModel ,
5556 Florence2ForConditionalGeneration ,
5657 Qwen2VLForConditionalGeneration ,
58+ PaliGemmaForConditionalGeneration ,
5759 MarianMTModel ,
5860 PatchTSTModel ,
5961 PatchTSTForPrediction ,
@@ -1072,6 +1074,58 @@ describe("Tiny random models", () => {
10721074 } ) ;
10731075 } ) ;
10741076
describe("paligemma", () => {
  // Prompt contains the special <image> placeholder token consumed by the processor.
  const text = "<image>What is on the flower?";

  // Synthetic input: a 224x224 RGB image with every pixel set to white.
  const [width, height, channels] = [224, 224, 3];
  const image = new RawImage(new Uint8ClampedArray(width * height * channels).fill(255), width, height, channels);

  describe("PaliGemmaForConditionalGeneration", () => {
    const model_id = "hf-internal-testing/tiny-random-PaliGemmaForConditionalGeneration";

    /** @type {PaliGemmaForConditionalGeneration} */
    let model;
    /** @type {PaliGemmaProcessor} */
    let processor;

    beforeAll(async () => {
      model = await PaliGemmaForConditionalGeneration.from_pretrained(model_id, {
        // TODO move to config
        ...DEFAULT_MODEL_OPTIONS,
      });
      processor = await AutoProcessor.from_pretrained(model_id);
    }, MAX_MODEL_LOAD_TIME);

    it(
      "forward",
      async () => {
        const inputs = await processor(image, text);

        // A single forward pass should yield logits of shape [batch, sequence, vocab].
        const { logits } = await model(inputs);
        expect(logits.dims).toEqual([1, 264, 257216]);
        expect(logits.mean().item()).toBeCloseTo(-0.0023024685215204954, 6);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    it(
      "batch_size=1",
      async () => {
        const inputs = await processor(image, text);
        const output_ids = await model.generate({ ...inputs, max_new_tokens: 10 });

        // Compare only the tokens produced beyond the prompt length.
        const generated = output_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
        expect(generated.tolist()).toEqual([[91711n, 24904n, 144054n, 124983n, 83862n, 124983n, 124983n, 124983n, 141236n, 124983n]]);
      },
      MAX_TEST_EXECUTION_TIME,
    );

    afterAll(async () => {
      await model?.dispose();
    }, MAX_MODEL_DISPOSE_TIME);
  });
});
1128+
10751129 describe ( "vision-encoder-decoder" , ( ) => {
10761130 describe ( "VisionEncoderDecoderModel" , ( ) => {
10771131 const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2" ;
0 commit comments