
Commit e28ac7a

Add Qwen2VLForConditionalGeneration unit tests
1 parent 2e945a5 commit e28ac7a

File tree

1 file changed (+91, -0)

tests/tiny_random.test.js

Lines changed: 91 additions & 0 deletions
@@ -52,6 +52,7 @@ import {
   WhisperForConditionalGeneration,
   VisionEncoderDecoderModel,
   Florence2ForConditionalGeneration,
+  Qwen2VLForConditionalGeneration,
   MarianMTModel,
 
   // Pipelines
@@ -833,6 +834,96 @@ describe("Tiny random models", () => {
     });
   });
 
+  describe("qwen2_vl", () => {
+    const CONVERSATION = [
+      {
+        role: "user",
+        content: [{ type: "text", text: "Hello" }],
+      },
+    ];
+
+    // Example adapted from https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct
+    const CONVERSATION_WITH_IMAGE = [
+      {
+        role: "user",
+        content: [{ type: "image" }, { type: "text", text: "Describe this image." }],
+      },
+    ];
+
+    // Empty white image
+    const dims = [224, 224, 3];
+    const image = new RawImage(new Uint8ClampedArray(dims[0] * dims[1] * dims[2]).fill(255), ...dims);
+
+    describe("Qwen2VLForConditionalGeneration", () => {
+      const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration";
+
+      /** @type {Qwen2VLForConditionalGeneration} */
+      let model;
+      /** @type {Qwen2VLProcessor} */
+      let processor;
+      beforeAll(async () => {
+        model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id, {
+          // TODO move to config
+          ...DEFAULT_MODEL_OPTIONS,
+        });
+        processor = await AutoProcessor.from_pretrained(model_id);
+      }, MAX_MODEL_LOAD_TIME);
+
+      it(
+        "forward",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text, image);
+          const { logits } = await model(inputs);
+          expect(logits.dims).toEqual([1, 89, 152064]);
+          expect(logits.mean().item()).toBeCloseTo(-0.0011299321195110679, 5);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      it(
+        "text-only (batch_size=1)",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text);
+          const generate_ids = await model.generate({
+            ...inputs,
+            max_new_tokens: 10,
+          });
+
+          const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
+          expect(new_tokens.tolist()).toEqual([[24284n, 63986n, 108860n, 84530n, 8889n, 23262n, 128276n, 64948n, 136757n, 138348n]]);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      it(
+        "text + image (batch_size=1)",
+        async () => {
+          const text = processor.apply_chat_template(CONVERSATION_WITH_IMAGE, {
+            add_generation_prompt: true,
+          });
+          const inputs = await processor(text, image);
+          const generate_ids = await model.generate({
+            ...inputs,
+            max_new_tokens: 10,
+          });
+
+          const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);
+          expect(new_tokens.tolist()).toEqual([[24284n, 35302n, 60575n, 38679n, 113390n, 115118n, 137596n, 38241n, 96726n, 142301n]]);
+        },
+        MAX_TEST_EXECUTION_TIME,
+      );
+
+      afterAll(async () => {
+        await model?.dispose();
+      }, MAX_MODEL_DISPOSE_TIME);
+    });
+  });
+
   describe("vision-encoder-decoder", () => {
     describe("VisionEncoderDecoderModel", () => {
       const model_id = "hf-internal-testing/tiny-random-VisionEncoderDecoderModel-vit-gpt2";
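For reference, a minimal standalone sketch of the API these tests exercise, assuming the same exports the test file imports (published as @huggingface/transformers); the final batch_decode call is an assumption based on other multimodal processors in the library, not something this commit uses:

import { AutoProcessor, Qwen2VLForConditionalGeneration, RawImage } from "@huggingface/transformers";

// Tiny random checkpoint used by the tests; swap in a real checkpoint for meaningful output.
const model_id = "hf-internal-testing/tiny-random-Qwen2VLForConditionalGeneration";
const model = await Qwen2VLForConditionalGeneration.from_pretrained(model_id);
const processor = await AutoProcessor.from_pretrained(model_id);

// Build a chat prompt containing one image placeholder, as in CONVERSATION_WITH_IMAGE above.
const text = processor.apply_chat_template(
  [{ role: "user", content: [{ type: "image" }, { type: "text", text: "Describe this image." }] }],
  { add_generation_prompt: true },
);

// Blank white 224x224 RGB image, mirroring the test fixture.
const image = new RawImage(new Uint8ClampedArray(224 * 224 * 3).fill(255), 224, 224, 3);

// Preprocess, generate, and keep only the newly generated tokens.
const inputs = await processor(text, image);
const generate_ids = await model.generate({ ...inputs, max_new_tokens: 10 });
const new_tokens = generate_ids.slice(null, [inputs.input_ids.dims.at(-1), null]);

// Decode back to text (assumed: processor.batch_decode, as with other transformers.js processors).
console.log(processor.batch_decode(new_tokens, { skip_special_tokens: true }));

await model.dispose();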
