@@ -52,6 +52,7 @@ import {
   WhisperForConditionalGeneration,
   VisionEncoderDecoderModel,
   Florence2ForConditionalGeneration,
+  Qwen2VLForConditionalGeneration,
   MarianMTModel,
 
   // Pipelines
@@ -833,6 +834,96 @@ describe("Tiny random models", () => {
     });
   });
 
+  describe("qwen2_vl", () => {
+    const CONVERSATION = [
+      {
+        role: "user",
+        content: [{ type: "text", text: "Hello" }],
+      },
+    ];
+
+    // Example adapted from https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
+    const CONVERSATION_WITH_IMAGE = [
+      {
+        role: "user",
+        content: [{ type: "image" }, { type: "text", text: "Describe this image." }],
+      },
+    ];
+    // Empty white image
+    const dims = [224, 224, 3];
+    const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims);
+
+    describe("Qwen2VLForConditionalGeneration", () => {
+      const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration";
+
+      /** @type {Qwen2VLForConditionalGeneration} */
+      let model;
+      /** @type {Qwen2VLProcessor} */
+      let processor;
+      beforeAll(async () => {
+        model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, {
+          // TODO move to config
+          ...DEFAULT_MODEL_OPTIONS,
+        });
+        processor = await AutoProcessor.from_pretrained(model_id);
+      }, MAX_MODEL_LOAD_TIME);
+
+      it(
+        "forward",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text, image);
+          const { logits } = await model(inputs);
+          expect(logits.dims).toEqual([1, 89, 152064]);
+          expect(logits.mean().item()).toBeCloseTo(-0.0011299321195110679, 5);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      it(
+        "text-only (batch_size=1)",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text);
+          const generate_ids = await model.generate({
+            ...inputs,
+            max_new_tokens: 10,
+          });
+
+          const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
+          expect(new_tokens.tolist()).toEqual([[24284n, 63986n, 108860n, 84530n, 8889n, 23262n, 128276n, 64948n, 136757n, 138348n]]);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      it(
+        "text + image (batch_size=1)",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text, image);
+          const generate_ids = await model.generate({
+            ...inputs,
+            max_new_tokens: 10,
+          });
+
+          const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
+          expect(new_tokens.tolist()).toEqual([[24284n, 35302n, 60575n, 38679n, 113390n, 115118n, 137596n, 38241n, 96726n, 142301n]]);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      afterAll(async () => {
+        await model?.dispose();
+      }, MAX_MODEL_DISPOSE_TIME);
+    });
+  });
+
   describe("vision-encoder-decoder", () => {
     describe("VisionEncoderDecoderModel", () => {
       const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2";
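For readers unfamiliar with the API under test, here is a minimal standalone sketch of the same flow outside the test harness. It is not part of this diff: it assumes the public @huggingface/transformers package and the Qwen/Qwen2-VL-2B-Instruct checkpoint referenced in the comment above, and the image URL is a hypothetical placeholder.

// Minimal usage sketch (not part of this PR); mirrors the test flow above.
// Assumes the @huggingface/transformers package and the Qwen/Qwen2-VL-2B-Instruct
// checkpoint; the image URL below is a hypothetical placeholder.
import { Qwen2VLForConditionalGeneration, AutoProcessor, RawImage } from "@huggingface/transformers";

const model_id = "Qwen/Qwen2-VL-2B-Instruct";
const model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id);
const processor = await AutoProcessor.from_pretrained(model_id);

// Build a chat prompt containing an image placeholder followed by text.
const conversation = [
  { role: "user", content: [{ type: "image" }, { type: "text", text: "Describe this image." }] },
];
const text = processor.apply_chat_template(conversation, { add_generation_prompt: true });

// Preprocess the prompt and image, then generate.
const image = await RawImage.fromURL("https://example.com/image.jpg"); // placeholder URL
const inputs = await processor(text, image);
const generate_ids = await model.generate({ ...inputs, max_new_tokens: 128 });

// Keep only the newly generated tokens (drop the prompt) and decode them.
const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
console.log(processor.batch_decode(new_tokens, { skip_special_tokens: true }));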