
Commit 3e116c9
test: add random image test for llama-3.2-11b-vision (NVIDIA#3055)
* add random image test for llama-3.2-11b-vision

Signed-off-by: Ivy Zhang <[email protected]>

* rename case

Signed-off-by: Ivy Zhang <[email protected]>

---------

Signed-off-by: Ivy Zhang <[email protected]>
Co-authored-by: Larry <[email protected]>

CI passed: https://nv/trt-llm-cicd/job/helpers/job/PR_Github/522/
1 parent f70b439 commit 3e116c9

File tree

3 files changed (+20 −1 lines changed)

examples/multimodal/run.py

Lines changed: 4 additions & 0 deletions
@@ -52,6 +52,10 @@ def print_result(model, input_text, output_text, args):
         ref_1 = ", it would be:.\\nPeter Rabbit is a rabbit.\\nHe lives in a cozy little house.\\nHe's a very good rabbit.\\"
         ref_2 = "Here is a haiku for the image:\n\n"
 
+    elif "Answer:" in input_text:
+        ref_1 = "2,173. <OCR/> A 1 2 3 4 5 6 Date Income 2005-12-17"
+        ref_2 = "Answer: 2,173. <OCR/> 1 2 3 4 5 6 Date Income 2005-12-17"
+
     elif "The key to life is" in input_text:
         ref_1 = "to find your passion and pursue it with all your heart. For me, that passion is photography. I love capturing the beauty of the world around me"
         ref_2 = "not to be found in the external world,"

tests/integration/defs/examples/test_multimodal.py

Lines changed: 16 additions & 1 deletion
@@ -478,9 +478,24 @@ def _test_llm_multimodal_general(llm_venv,
         "If I had to write a haiku for this one"
     ])
 
-    print("Run mllama vision test...")
+    print("Run mllama vision test in with example image ...")
     _call_run_cmd(llm_venv, llm_root, run_cmd_vision, world_size)
 
+    print("multimodal_example_root: ", multimodal_example_root)
+    print("llm_root: ", llm_root)
+    run_cmd_vision = run_cmd.copy()
+    run_cmd_vision.extend([
+        "--cross_kv_cache_fraction=0.5",  # mllama uses cross attention
+        "--image_path",
+        os.path.join(
+            llm_root,
+            "tests/integration/test_input_files/excel_table_test.jpg"),
+        "--input_text",
+        "What is the total income? Answer:"
+    ])
+
+    print("Run mllama vision test with random image ...")
+
     run_cmd_text = run_cmd.copy()
     run_cmd_text.extend([
         "--cross_kv_cache_fraction=0.5",  # mllama uses cross attention
tests/integration/test_input_files/excel_table_test.jpg

Binary image file added (19.7 KB); preview not shown.
