
Commit 11e4b5e

make some ut cases pass on xpu w/ latest torch (#41337)
* make some ut cases pass on xpu w/ latest torch
* Update test_modeling_llava_onevision.py
* Apply style fixes

Signed-off-by: Yao, Matrix <[email protected]>
Co-authored-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
Parent: fa36c97

4 files changed (+36, -15 lines)
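Reviewer note: every hunk below relies on the Expectations helper from transformers.testing_utils, which maps a (device_type, version) key to the output expected on that hardware and picks the matching entry at runtime via get_expectation(). The sketch below is a minimal mental model of that pattern, not the real implementation: the _device_key probe and the XPU version key are illustrative assumptions, and the actual helper's matching rules (e.g. for compute-capability tuples like ("cuda", (8, 0))) are more elaborate.

import torch

def _device_key():
    # Illustrative probe (assumption): returns e.g. ("cuda", 8), ("xpu", 3), or ("cpu", None).
    if torch.cuda.is_available():
        major, _minor = torch.cuda.get_device_capability()
        return ("cuda", major)
    if hasattr(torch, "xpu") and torch.xpu.is_available():
        return ("xpu", 3)  # assumption: the version key these tests use for current XPUs
    return ("cpu", None)

class Expectations(dict):
    # Device-keyed expected outputs; sketch of the transformers testing helper.
    def get_expectation(self):
        device, version = _device_key()
        # Prefer an exact (device, version) match, then fall back to the
        # (device, None) wildcard used by entries such as ("cuda", None).
        if (device, version) in self:
            return self[(device, version)]
        if (device, None) in self:
            return self[(device, None)]
        raise KeyError(f"no expectation registered for {(device, version)}")

With that model in mind, the commit does two things: it refreshes the ("xpu", 3) expectations to match what the latest torch actually generates, and it migrates two llava_onevision tests from a single hard-coded string to this device-keyed pattern.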

tests/models/aya_vision/test_modeling_aya_vision.py

Lines changed: 2 additions & 2 deletions
@@ -393,7 +393,7 @@ def test_small_model_integration_batched_generate(self):
         decoded_output = processor.decode(output[0, inputs["input_ids"].shape[1] :], skip_special_tokens=True)
         expected_outputs = Expectations(
             {
-                ("xpu", 3): "Wooden path to water,\nMountains echo in stillness,\nPeaceful forest lake.",
+                ("xpu", 3): "Wooden bridge stretches\nInto still waters, mountains gleam\nPeaceful forest scene",
                 # 4-bit
                 ("cuda", 7): "Wooden bridge stretches\nMirrored lake below, mountains rise\nPeaceful, serene",
                 ("cuda", 8): 'Wooden path to water,\nMountains echo in stillness,\nPeaceful forest scene.',
@@ -412,7 +412,7 @@ def test_small_model_integration_batched_generate(self):
 
         expected_outputs = Expectations(
             {
-                ("xpu", 3): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',
+                ("xpu", 3): 'This vibrant image captures a bustling street scene in a Chinese-influenced neighborhood. The focal point is a striking red stop sign',
                 # 4-bit
                 ("cuda", 7): 'This vibrant image captures a bustling street scene in a multicultural urban area, featuring a traditional Chinese gate adorned with intricate red and',
                 ("cuda", 8): 'This image captures a vibrant street scene in a bustling urban area, likely in an Asian city. The focal point is a',

tests/models/gemma3/test_modeling_gemma3.py

Lines changed: 5 additions & 6 deletions
@@ -446,8 +446,8 @@ def test_model_4b_batch(self):
             {
                 ("xpu", 3):
                 [
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown and white cow standing on a sandy beach next to a turquoise ocean. It looks like a very sunny and',
-                    'user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. They depict very different scenes:\n\n* **Image 1** shows a cow standing on a beach.',
+                    'user\nYou are a helpful assistant.\n\n\n\n\n\nWhat is shown in this image?\nmodel\nCertainly! \n\nThe image shows a brown cow standing on a sandy beach with turquoise water and a blue sky in the background. It looks like a',
+                    "user\nYou are a helpful assistant.\n\n\n\n\n\n\n\n\n\nAre these images identical?\nmodel\nNo, these images are not identical. \n\nHere's a breakdown of the differences:\n\n* **Image 1:** Shows a brown",
                 ],
                 ("cuda", (8,0)):
                 [
@@ -567,9 +567,8 @@ def test_model_4b_batch_crops(self):
         EXPECTED_TEXTS = Expectations(
             {
                 ("xpu", 3): [
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There are clouds in the blue sky above.',
-                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a',
-                ],
+                    "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a bright blue sky with some white clouds in the",
+                    'user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nAre these images identical?\nmodel\nNo, the images are not identical. \n\nThe first image shows a cow on a beach, while the second image shows a street scene with a'],
                 ("cuda", 7): [],
                 ("cuda", (8,0)): [
                     "user\nYou are a helpful assistant.\n\nHere is the original image \n\n\n\n and here are some crops to help you see better \n\n\n\n \n\n\n\nWhat is shown in this image?\nmodel\nThe image shows a brown cow standing on a sandy beach next to a turquoise ocean. There's a blue sky with some white clouds in the background",
@@ -627,7 +626,7 @@ def test_model_4b_multiimage(self):
         output_text = self.processor.batch_decode(output, skip_special_tokens=True)
         EXPECTED_TEXTS = Expectations(
             {
-                ("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image!\n\nHere's a description of the scene:\n\n* **Chinese Arch"],
+                ("xpu", 3): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a city with"],
                 ("cuda", 7): [],
                 ("cuda", (8, 0)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt looks like a street scene in a vibrant,"],
                 ("cuda", (8, 6)): ["user\nYou are a helpful assistant.\n\n\n\n\n\nWhat do you see here?\nmodel\nOkay, let's break down what I see in this image:\n\n**Overall Scene:**\n\nIt appears to be a street scene in a city"],

tests/models/llava/test_modeling_llava.py

Lines changed: 1 addition & 1 deletion
@@ -626,7 +626,7 @@ def test_pixtral_4bit(self):
         EXPECTED_GENERATIONS = Expectations(
             {
                 ("cuda", 7): "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador",
-                ("xpu", 3): "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which covers the entire background. The dog appears to be the main focus",
+                ("xpu", 3): "Describe the images.The image showcases a dog, which is prominently positioned in the center, taking up a significant portion of the frame. The dog is situated against a backdrop of a wooden surface, which spans the entire image. The dog appears to be a black Labrador",
                 ("rocm", (9, 5)): "Describe the images.The image features a dog positioned centrally, taking up a significant portion of the frame. The dog is situated against a backdrop of rugged terrain, which includes rocky cliffs and grassy slopes. The dog appears to be in a relaxed posture, possibly looking directly",
             }
         ) # fmt: skip

tests/models/llava_onevision/test_modeling_llava_onevision.py

Lines changed: 28 additions & 6 deletions
@@ -408,12 +408,18 @@ def test_small_model_integration_test_multi_image(self):
 
         # verify generation
         output = model.generate(**inputs, max_new_tokens=40)
-        EXPECTED_DECODED_TEXT = "user\n\nWhat is the difference between these images?\nassistant\nThe images you've provided appear to be related to a graphical representation of a radar chart, which is a type of data visualization used to show the distribution of a particular variable across a geographic area. The" # fmt: skip
-
-        self.assertEqual(
-            self.processor.decode(output[0], skip_special_tokens=True),
-            EXPECTED_DECODED_TEXT,
+        output_text = self.processor.decode(output[0], skip_special_tokens=True)
+        # fmt: off
+        EXPECTED_DECODED_TEXTS = Expectations(
+            {
+                ("cuda", None): "user\n\nWhat is the difference between these images?\nassistant\nThe images you've provided appear to be related to a graphical representation of a radar chart, which is a type of data visualization used to show the distribution of a particular variable across a geographic area. The",
+                ("xpu", 3): "user\n\nWhat is the difference between these images?\nassistant\nThe images you've provided appear to be related to a graphical representation of a radar chart, which is a type of data visualization used to show the distribution of a particular variable across a geographic area. The",
+            }
         )
+        EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
+        # fmt: on
+
+        self.assertEqual(output_text, EXPECTED_DECODED_TEXT)
 
     @slow
     @require_bitsandbytes
@@ -442,7 +448,23 @@ def test_small_model_integration_test_multi_image_nested(self):
 
         # verify generation
         output = model.generate(**inputs, max_new_tokens=40)
-        EXPECTED_DECODED_TEXT = ["user\nTell me about the french revolution.\nassistant\nThe French Revolution! A pivotal event in modern history that had a profound impact on the course of Western civilization. Here's a brief overview:\n\n**Background**\n\nIn the late 18th century,", "user\n\nWhat is the difference between these images?\nassistant\nThe first image shows a stop sign with a traditional Chinese architectural background, while the second image displays a radar chart with various algorithms and models, including BLIP-2, InstructBLIP, Q", "user\n\nWhat do you see in this image?\nassistant\nThe image is a radar chart that compares the performance of different models in a specific task, likely related to natural language processing or machine learning. The chart is divided into several axes, each representing a different"] # fmt: skip
+        # fmt: off
+        EXPECTED_DECODED_TEXTS = Expectations(
+            {
+                ("cuda", None): [
+                    "user\nTell me about the french revolution.\nassistant\nThe French Revolution! A pivotal event in modern history that had a profound impact on the course of Western civilization. Here's a brief overview:\n\n**Background**\n\nIn the late 18th century,",
+                    "user\n\nWhat is the difference between these images?\nassistant\nThe first image shows a stop sign with a traditional Chinese architectural background, while the second image displays a radar chart with various algorithms and models, including BLIP-2, InstructBLIP, Q",
+                    "user\n\nWhat do you see in this image?\nassistant\nThe image is a radar chart that compares the performance of different models in a specific task, likely related to natural language processing or machine learning. The chart is divided into several axes, each representing a different"
+                ],
+                ("xpu", 3): [
+                    "user\nTell me about the french revolution.\nassistant\nThe French Revolution! A pivotal event in modern history that had a profound impact on the course of Western civilization. Here's a brief overview:\n\n**Background**\n\nIn the late 18th century,",
+                    'user\n\nWhat is the difference between these images?\nassistant\nThe image shows a traffic light with a stop sign in the foreground, while the other images show a car driving through a street intersection.',
+                    'user\n\nWhat do you see in this image?\nassistant\nThe image is a radar chart that represents the performance of different machine learning models in terms of their ability to predict the number of users who have been infected with COVID-19. The radar chart is'
+                ],
+            }
+        )
+        EXPECTED_DECODED_TEXT = EXPECTED_DECODED_TEXTS.get_expectation()
+        # fmt: on
         DECODED_TEXT = self.processor.batch_decode(output, skip_special_tokens=True)
 
         self.assertListEqual(DECODED_TEXT, EXPECTED_DECODED_TEXT)

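Usage note: the llava_onevision tests above are gated behind @slow (visible in the hunk context), and in the transformers suite slow tests only run when RUN_SLOW=1 is set, e.g.:

RUN_SLOW=1 python -m pytest tests/models/llava_onevision/test_modeling_llava_onevision.py -k multi_image

(The -k filter is an illustrative selection, not part of the commit.)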