@@ -3106,6 +3106,173 @@ def test_qnn_backend_draw_graph(self):
         ), "Generated .dot file does not match the golden file."


+class TestExampleLLMScript(TestQNN):
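+    # End-to-end tests for the llama example script on the QNN backend.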
+    def required_envs(self, conditions=None) -> bool:
+        conditions = [] if conditions is None else conditions
+        return all(
+            [
+                self.executorch_root,
+                self.artifact_dir,
+                *conditions,
+            ]
+        )
+
+    def test_llama3_2_1b(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "What is the meaning of life?"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/consolidated.00.pth",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "llama3_2",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "512",
+            "--num_sharding",
+            "4",
+        ]
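+        # hybrid model_mode bundles a prefill graph (--prefill_seq_len 32) and a
+        # KV-cache decode graph (--kv_seq_len 512) into a single pte, split
+        # across four shards (--num_sharding 4).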
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
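+        # llama.py runs in a subprocess; it connects back over (ip, port) and
+        # reports its results to this test as a JSON payload.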
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                # Inference speed on x86 is slow, so we only check when running on Android.
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 1300000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
+
+    def test_llama_stories_110m(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories110M.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "stories110m",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
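+        # Same subprocess/listener handshake as test_llama3_2_1b, with bounds
+        # adjusted for the much smaller stories110M model.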
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 130000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
+
+
 class TestExampleOssScript(TestQNN):
     def required_envs(self, conditions=None) -> bool:
         conditions = [] if conditions is None else conditions
@@ -4001,72 +4168,6 @@ def test_deeplab_v3(self):
         self.assertGreaterEqual(msg["MPA"], 0.70)
         self.assertGreaterEqual(msg["MIoU"], 0.55)

-    def test_stories_single_llama(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--checkpoint",
-            f"{self.artifact_dir}/stories110M.pt",
-            "--params",
-            f"{self.artifact_dir}/params.json",
-            "--tokenizer_model",
-            f"{self.artifact_dir}/tokenizer.model",
-            "--tokenizer_bin",
-            f"{self.artifact_dir}/tokenizer.bin",
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            "Once",
-            "--ptq",
-            "16a4w",
-            "--temperature",
-            "0",
-            "--llama_model",
-            "stories110m",
-            "--model_mode",
-            "hybrid",
-            "--prefill_seq_len",
-            "32",
-            "--kv_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-
-        golden_start_with = "Once upon a time,"
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.compile_only:
-                    model_out = msg["result"][0]
-                    self.assertTrue(model_out.startswith(golden_start_with))
-                # x86 does not allow weight sharing, so we don't check pte size
-                if not self.enable_x86_64:
-                    pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 130000000)
-
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
         if not self.required_envs([self.pretrained_weight]):
@@ -4271,6 +4372,18 @@ def setup_environment():
         type=str,
     )

+    parser.add_argument(
+        "--pre_gen_pte",
+        help="Run the pre-generated pte in the given directory.",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--llama_artifacts",
+        help="Path to a folder containing the model weights, tokenizer, and params files.",
+        type=str,
+    )
+
     args, ns_args = parser.parse_known_args(namespace=unittest)
     TestQNN.host = args.host
     TestQNN.device = args.device
@@ -4289,6 +4402,8 @@ def setup_environment():
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
     TestQNN.compile_only = args.compile_only
+    TestQNN.pre_gen_pte = args.pre_gen_pte
+    TestQNN.llama_artifacts = args.llama_artifacts

     return sys.argv[:1] + ns_args
