
Commit a945327

Qualcomm AI Engine Direct - Support multimodal (VLM) runner

Summary:
- Runtime
  - Support for models: SmolVLM 500M, InternVL3 1B
  - Add hybrid mode runtime requantization in multimodal runner
- CI
  - Refactor VLM test script
  - Add VLM accuracy/performance runtime tests
- Refactor (VLM)
  - Rename embedding forward input for CPU quantization
  - Update VLM vision encoder architecture to align with upcoming transformers 5.0 changes
- Documentation
  - Add readme for multimodal VLM
1 parent: 20680c6

34 files changed: +3845 −240 lines

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 74 additions & 63 deletions
@@ -6490,70 +6490,55 @@ def test_qwen2_5(self):
 
 
 class TestExampleMultimodalityScript(TestQNN):
-    def test_smolvlm_500m_instruct(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
 
-        prompt = "Can you describe this image?"
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            prompt,
-            "--temperature",
-            "0",
-            "--decoder_model",
-            "smolvlm_500m_instruct",
-            "--model_mode",
-            "kv",
-            "--max_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-        if self.pre_gen_pte:
-            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+    @dataclass(frozen=True)
+    class MLLMSpecs:
+        max_seq_len: int
+        sm8650_token_rate: float
+        sm8750_token_rate: float
+        encoder_pte_size: float
+        text_embedding_pte_size: float
+        decoder_pte_size: float
 
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.enable_x86_64:
-                    encoder_pte_size = msg["encoder_pte_size"]
-                    text_embedding_pte_size = msg["text_embedding_pte_size"]
-                    decoder_pte_size = msg["pte_size"]
-                    self.assertLessEqual(encoder_pte_size, 110_000_000)  # 110MB
-                    self.assertLessEqual(text_embedding_pte_size, 100_000_000)  # 100MB
-                    self.assertLessEqual(decoder_pte_size, 400_000_000)  # 400MB
-                    print(f"Encoder PTE Size: {encoder_pte_size} bytes")
-                    print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
-                    print(f"Decoder PTE Size: {decoder_pte_size} bytes")
+    @dataclass(frozen=True)
+    class VLMSpecs(MLLMSpecs):
+        image_path: str
+        golden_image_feature: str
 
-    def test_internvl3_1b(self):
-        if not self.required_envs():
+    # TODO: refactor to support different backends
+    def setUp(self):
+        self.vlm_specs = {
+            "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
+                max_seq_len=128,
+                sm8650_token_rate=50,
+                sm8750_token_rate=55,
+                encoder_pte_size=110_000_000,  # 110MB
+                text_embedding_pte_size=100_000_000,  # 100MB
+                decoder_pte_size=400_000_000,  # 400MB
+                image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",  # New York Bay
+                golden_image_feature="city",
+            ),
+            "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs(
+                max_seq_len=320,
+                sm8650_token_rate=11,
+                sm8750_token_rate=13,
+                encoder_pte_size=425_000_000,  # 425MB
+                text_embedding_pte_size=300_000_000,  # 300MB
+                decoder_pte_size=550_000_000,  # 550MB
+                image_path="http://images.cocodataset.org/val2017/000000039769.jpg",  # Two cats lying on a blanket
+                golden_image_feature="cats",
+            ),
+        }
+
+    def test_static_vlm(self):
+        if not self.required_envs([self.model_name]):
             self.skipTest("missing required envs")
 
+        vlm_specs: TestExampleMultimodalityScript.VLMSpecs = self.vlm_specs[
+            self.model_name
+        ]
         prompt = "Can you describe this image?"
+        image_path = vlm_specs.image_path
         cmds = [
             "python",
             f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
@@ -6569,14 +6554,16 @@ def test_internvl3_1b(self):
             str(self.port),
             "--prompt",
             prompt,
+            "--image_path",
+            image_path,
             "--temperature",
             "0",
             "--decoder_model",
-            "internvl3_1b",
+            f"{self.model_name}",
             "--model_mode",
             "kv",
             "--max_seq_len",
-            "320",
+            f"{vlm_specs.max_seq_len}",
         ]
         if self.compile_only:
             cmds.extend(["--compile_only"])
@@ -6597,17 +6584,41 @@ def test_internvl3_1b(self):
             if "Error" in msg:
                 self.fail(msg["Error"])
             else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        vlm_specs.golden_image_feature in model_out,
+                        f"Expected Output contains feature: '{vlm_specs.golden_image_feature}' Actual Output: '{model_out}'",
+                    )
+                    print(f"Image Path: {image_path}")
+                    print(f"Query: {prompt}")
+                    print(f"Answer: {model_out}")
                 if not self.enable_x86_64:
                     encoder_pte_size = msg["encoder_pte_size"]
                     text_embedding_pte_size = msg["text_embedding_pte_size"]
                     decoder_pte_size = msg["pte_size"]
-                    self.assertLessEqual(encoder_pte_size, 425_000_000)  # 425MB
-                    self.assertLessEqual(text_embedding_pte_size, 300_000_000)  # 300MB
-                    self.assertLessEqual(decoder_pte_size, 550_000_000)  # 550MB
+                    self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)
+                    self.assertLessEqual(
+                        text_embedding_pte_size, vlm_specs.text_embedding_pte_size
+                    )
+                    self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size)
                     print(f"Encoder PTE Size: {encoder_pte_size} bytes")
                     print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
                     print(f"Decoder PTE Size: {decoder_pte_size} bytes")
 
+                attr_name = f"{self.model.lower()}_token_rate"
+                if (
+                    not self.compile_only
+                    and not self.enable_x86_64
+                    and hasattr(vlm_specs, attr_name)
+                ):
+                    device_inference_speed = msg["inference_speed"]
+                    expected_inference_speed = getattr(vlm_specs, attr_name)
+                    print(f"Prompt Evaluation: {device_inference_speed} tokens/second")
+                    self.assertGreaterEqual(
+                        device_inference_speed, expected_inference_speed
+                    )
+
 
 class TestExampleOssScript(TestQNN):
     def test_albert(self):
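
The diff above collapses the two per-model tests into a single table-driven test_static_vlm. A minimal, standalone sketch of that pattern follows; check_specs() and the sample msg dict are hypothetical stand-ins for the unittest assertions and the JSON payload the listener receives from the device, and the spec is trimmed to a few fields:

from dataclasses import dataclass


@dataclass(frozen=True)
class VLMSpecs:
    max_seq_len: int
    sm8650_token_rate: float
    sm8750_token_rate: float
    decoder_pte_size: int
    golden_image_feature: str


VLM_SPECS = {
    "smolvlm_500m_instruct": VLMSpecs(128, 50, 55, 400_000_000, "city"),
    "internvl3_1b": VLMSpecs(320, 11, 13, 550_000_000, "cats"),
}


def check_specs(model_name: str, soc: str, msg: dict) -> None:
    specs = VLM_SPECS[model_name]
    # Artifact-size budget, mirroring assertLessEqual in the test.
    assert msg["pte_size"] <= specs.decoder_pte_size
    # Golden-feature accuracy check on the decoded answer.
    assert specs.golden_image_feature in msg["result"][0]
    # Per-SoC performance gate: "SM8750" resolves to sm8750_token_rate, etc.
    attr_name = f"{soc.lower()}_token_rate"
    if hasattr(specs, attr_name):
        assert msg["inference_speed"] >= getattr(specs, attr_name)


if __name__ == "__main__":
    check_specs(
        "internvl3_1b",
        "SM8750",
        {
            "pte_size": 540_000_000,
            "result": ["Two cats on a blanket"],
            "inference_speed": 14.2,
        },
    )
    print("all spec checks passed")

Because the token-rate threshold is looked up by attribute name, gating a new SoC is just another field on the spec dataclass rather than another copy of the test.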

examples/qualcomm/oss_scripts/llama/CMakeLists.txt

Lines changed: 57 additions & 0 deletions
@@ -81,3 +81,60 @@ target_compile_options(qnn_llama_runner PUBLIC ${_common_compile_options})
 set_target_properties(
   qnn_llama_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
 )
+
+# preprocess qnn runner src files for multimodal
+set(_multimodal_runner__srcs ${_llama_runner__srcs})
+list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*qnn_llama_runner.*")
+list(FILTER _multimodal_runner__srcs EXCLUDE REGEX ".*runner/runner\.(cpp|h)")
+list(
+  PREPEND
+  _multimodal_runner__srcs
+  ${CMAKE_CURRENT_LIST_DIR}/qnn_multimodal_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_runner.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/encoder.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_runner.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/embedding_processor.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_prompt_processor.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_token_generator.h
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.cpp
+  ${CMAKE_CURRENT_LIST_DIR}/runner/multimodal_runner/multimodal_lhd_token_generator.h
+)
+
+list(APPEND _multimodal_runner__srcs)
+
+# build qnn multimodal runner
+add_executable(qnn_multimodal_runner ${_multimodal_runner__srcs})
+target_include_directories(
+  qnn_multimodal_runner PUBLIC ${_common_include_directories}
+)
+target_include_directories(
+  qnn_multimodal_runner
+  PUBLIC ${EXECUTORCH_ROOT}/extension/llm/tokenizers/include
+)
+target_compile_options(qnn_multimodal_runner PUBLIC ${_common_compile_options})
+
+target_link_libraries(
+  qnn_multimodal_runner
+  qnn_executorch_backend
+  executorch_core
+  extension_data_loader
+  extension_flat_tensor
+  extension_llm_runner
+  extension_module
+  extension_tensor
+  gflags
+  custom_ops
+  quantized_ops_lib
+  quantized_kernels
+  tokenizers::tokenizers
+)
+
+set_target_properties(
+  qnn_multimodal_runner PROPERTIES LINK_FLAGS "-Wl,-rpath='$ORIGIN'"
+)
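
For readers less familiar with CMake list operations, here is a rough Python model of how the block above derives the multimodal source list from the existing llama runner sources: drop whatever matches the EXCLUDE REGEX filters, then prepend the multimodal-specific files. Only the two regexes come from the CMakeLists; the sample paths (e.g. kv_manager.cpp as a shared source) are illustrative:

import re

# Stand-in for ${_llama_runner__srcs}; real contents live in the CMakeLists.
llama_runner_srcs = [
    "qnn_llama_runner.cpp",   # removed by the first filter
    "runner/runner.cpp",      # removed by the second filter
    "runner/runner.h",        # removed by the second filter
    "runner/kv_manager.cpp",  # hypothetical shared source, kept
]

# list(FILTER ... EXCLUDE REGEX ...) drops items matching anywhere,
# which re.search() mirrors.
exclude_patterns = [r".*qnn_llama_runner.*", r".*runner/runner\.(cpp|h)"]
multimodal_srcs = [
    s
    for s in llama_runner_srcs
    if not any(re.search(p, s) for p in exclude_patterns)
]

# list(PREPEND ...) puts the multimodal entry points first.
multimodal_srcs = [
    "qnn_multimodal_runner.cpp",
    "runner/multimodal_runner/multimodal_runner.cpp",
] + multimodal_srcs

# ['qnn_multimodal_runner.cpp', 'runner/multimodal_runner/multimodal_runner.cpp',
#  'runner/kv_manager.cpp']
print(multimodal_srcs)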
