@@ -6490,70 +6490,55 @@ def test_qwen2_5(self):


class TestExampleMultimodalityScript(TestQNN):
-    def test_smolvlm_500m_instruct(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")

-        prompt = "Can you describe this image?"
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            prompt,
-            "--temperature",
-            "0",
-            "--decoder_model",
-            "smolvlm_500m_instruct",
-            "--model_mode",
-            "kv",
-            "--max_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-        if self.pre_gen_pte:
-            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
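+    # Per-model test thresholds: PTE size budgets are in bytes; token rates
+    # are minimum expected tokens/second per target SoC.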
+    @dataclass(frozen=True)
+    class MLLMSpecs:
+        max_seq_len: int
+        sm8650_token_rate: float
+        sm8750_token_rate: float
+        encoder_pte_size: float
+        text_embedding_pte_size: float
+        decoder_pte_size: float

-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.enable_x86_64:
-                    encoder_pte_size = msg["encoder_pte_size"]
-                    text_embedding_pte_size = msg["text_embedding_pte_size"]
-                    decoder_pte_size = msg["pte_size"]
-                    self.assertLessEqual(encoder_pte_size, 110_000_000)  # 110MB
-                    self.assertLessEqual(text_embedding_pte_size, 100_000_000)  # 100MB
-                    self.assertLessEqual(decoder_pte_size, 400_000_000)  # 400MB
-                    print(f"Encoder PTE Size: {encoder_pte_size} bytes")
-                    print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
-                    print(f"Decoder PTE Size: {decoder_pte_size} bytes")
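+    # VLM-specific fields: the test image and a keyword the generated
+    # description is expected to contain.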
+    @dataclass(frozen=True)
+    class VLMSpecs(MLLMSpecs):
+        image_path: str
+        golden_image_feature: str

-    def test_internvl3_1b(self):
-        if not self.required_envs():
+    # TODO: refactor to support different backends
+    def setUp(self):
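+        # Spec registry keyed by decoder model name; test_static_vlm looks up
+        # the entry selected by self.model_name.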
+        self.vlm_specs = {
+            "smolvlm_500m_instruct": TestExampleMultimodalityScript.VLMSpecs(
+                max_seq_len=128,
+                sm8650_token_rate=50,
+                sm8750_token_rate=55,
+                encoder_pte_size=110_000_000,  # 110MB
+                text_embedding_pte_size=100_000_000,  # 100MB
+                decoder_pte_size=400_000_000,  # 400MB
+                image_path="https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",  # New York Bay
+                golden_image_feature="city",
+            ),
+            "internvl3_1b": TestExampleMultimodalityScript.VLMSpecs(
+                max_seq_len=320,
+                sm8650_token_rate=11,
+                sm8750_token_rate=13,
+                encoder_pte_size=425_000_000,  # 425MB
+                text_embedding_pte_size=300_000_000,  # 300MB
+                decoder_pte_size=550_000_000,  # 550MB
+                image_path="http://images.cocodataset.org/val2017/000000039769.jpg",  # Two cats lying on a blanket
+                golden_image_feature="cats",
+            ),
+        }
+
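+    # Shared test body for every decoder model registered in setUp.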
+    def test_static_vlm(self):
+        if not self.required_envs([self.model_name]):
            self.skipTest("missing required envs")

+        vlm_specs: TestExampleMultimodalityScript.VLMSpecs = self.vlm_specs[
+            self.model_name
+        ]
        prompt = "Can you describe this image?"
+        image_path = vlm_specs.image_path
        cmds = [
            "python",
            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
@@ -6569,14 +6554,16 @@ def test_internvl3_1b(self):
            str(self.port),
            "--prompt",
            prompt,
+            "--image_path",
+            image_path,
            "--temperature",
            "0",
            "--decoder_model",
-            "internvl3_1b",
+            f"{self.model_name}",
            "--model_mode",
            "kv",
            "--max_seq_len",
-            "320",
+            f"{vlm_specs.max_seq_len}",
        ]
        if self.compile_only:
            cmds.extend(["--compile_only"])
@@ -6597,17 +6584,41 @@ def test_internvl3_1b(self):
            if "Error" in msg:
                self.fail(msg["Error"])
            else:
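+                # Loose correctness check: the generated answer should mention
+                # the golden feature expected for this image.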
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        vlm_specs.golden_image_feature in model_out,
+                        f"Expected output to contain feature '{vlm_specs.golden_image_feature}'; actual output: '{model_out}'",
+                    )
+                    print(f"Image Path: {image_path}")
+                    print(f"Query: {prompt}")
+                    print(f"Answer: {model_out}")
                if not self.enable_x86_64:
                    encoder_pte_size = msg["encoder_pte_size"]
                    text_embedding_pte_size = msg["text_embedding_pte_size"]
                    decoder_pte_size = msg["pte_size"]
-                    self.assertLessEqual(encoder_pte_size, 425_000_000)  # 425MB
-                    self.assertLessEqual(text_embedding_pte_size, 300_000_000)  # 300MB
-                    self.assertLessEqual(decoder_pte_size, 550_000_000)  # 550MB
+                    self.assertLessEqual(encoder_pte_size, vlm_specs.encoder_pte_size)
+                    self.assertLessEqual(
+                        text_embedding_pte_size, vlm_specs.text_embedding_pte_size
+                    )
+                    self.assertLessEqual(decoder_pte_size, vlm_specs.decoder_pte_size)
                    print(f"Encoder PTE Size: {encoder_pte_size} bytes")
                    print(f"Text Embedding PTE Size: {text_embedding_pte_size} bytes")
                    print(f"Decoder PTE Size: {decoder_pte_size} bytes")

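+                # self.model is the target SoC (e.g. SM8650); enforce its
+                # <soc>_token_rate floor when one is defined in the specs.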
+                attr_name = f"{self.model.lower()}_token_rate"
+                if (
+                    not self.compile_only
+                    and not self.enable_x86_64
+                    and hasattr(vlm_specs, attr_name)
+                ):
+                    device_inference_speed = msg["inference_speed"]
+                    expected_inference_speed = getattr(vlm_specs, attr_name)
+                    print(f"Prompt Evaluation: {device_inference_speed} tokens/second")
+                    self.assertGreaterEqual(
+                        device_inference_speed, expected_inference_speed
+                    )
+

class TestExampleOssScript(TestQNN):
    def test_albert(self):