@@ -2991,6 +2991,173 @@ def test_qnn_backend_draw_graph(self):
29912991 ), "Generated .dot file does not match the golden file."
29922992
29932993
+class TestExampleLLMScript(TestQNN):
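+    """End-to-end tests that compile and run the example Llama script on the QNN backend."""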
+    def required_envs(self, conditions=None) -> bool:
+        conditions = [] if conditions is None else conditions
+        return all(
+            [
+                self.executorch_root,
+                self.artifact_dir,
+                *conditions,
+            ]
+        )
+
+    def test_llama3_2_1b(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "What is the meaning of life?"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/consolidated.00.pth",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "llama3_2",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "512",
+            "--num_sharding",
+            "4",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
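+        # llama.py runs as a subprocess and reports its results back to this
+        # process over a multiprocessing.connection socket as a JSON payload.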
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                # Inference speed on x86 is slow, so we only check it when running on Android.
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 1300000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
+
+    def test_llama_stories_110m(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories110M.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "stories110m",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 130000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
+
+
 class TestExampleOssScript(TestQNN):
     def required_envs(self, conditions=None) -> bool:
         conditions = [] if conditions is None else conditions
@@ -3886,72 +4053,6 @@ def test_deeplab_v3(self):
         self.assertGreaterEqual(msg["MPA"], 0.70)
         self.assertGreaterEqual(msg["MIoU"], 0.55)

-    def test_stories_single_llama(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--checkpoint",
-            f"{self.artifact_dir}/stories110M.pt",
-            "--params",
-            f"{self.artifact_dir}/params.json",
-            "--tokenizer_model",
-            f"{self.artifact_dir}/tokenizer.model",
-            "--tokenizer_bin",
-            f"{self.artifact_dir}/tokenizer.bin",
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            "Once",
-            "--ptq",
-            "16a4w",
-            "--temperature",
-            "0",
-            "--llama_model",
-            "stories110m",
-            "--model_mode",
-            "hybrid",
-            "--prefill_seq_len",
-            "32",
-            "--kv_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-
-        golden_start_with = "Once upon a time,"
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.compile_only:
-                    model_out = msg["result"][0]
-                    self.assertTrue(model_out.startswith(golden_start_with))
-                # x86 does not allow weight sharing, so we don't check pte size
-                if not self.enable_x86_64:
-                    pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 130000000)
-
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
         if not self.required_envs([self.pretrained_weight]):
@@ -4156,6 +4257,18 @@ def setup_environment():
         type=str,
     )

+    parser.add_argument(
+        "--pre_gen_pte",
+        help="Run the pre-generated .pte file in the given directory.",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--llama_artifacts",
+        help="Path to a folder containing the weight, tokenizer, and params files.",
+        type=str,
+    )
+
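+    # Both values are surfaced to the tests as TestQNN.pre_gen_pte and
+    # TestQNN.llama_artifacts via the assignments after parse_known_args below.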
     args, ns_args = parser.parse_known_args(namespace=unittest)
     TestQNN.host = args.host
     TestQNN.device = args.device
@@ -4174,6 +4287,8 @@ def setup_environment():
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
     TestQNN.compile_only = args.compile_only
+    TestQNN.pre_gen_pte = args.pre_gen_pte
+    TestQNN.llama_artifacts = args.llama_artifacts

     return sys.argv[:1] + ns_args

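Note: these tests assume the example script connects back to the test harness
and sends a single JSON string over a multiprocessing.connection socket. A
minimal sketch of that reporting side follows; it is illustrative only (the
field names simply mirror the keys the tests read), not the actual llama.py
code:

    import json
    from multiprocessing.connection import Client

    def report_results(ip, port, outputs, pte_size, tokens_per_sec):
        # Connect back to the Listener opened by the test harness.
        with Client((ip, port)) as conn:
            # The tests call json.loads(conn.recv()) and read these keys.
            conn.send(
                json.dumps(
                    {
                        "result": outputs,  # list of decoded output strings
                        "pte_size": pte_size,  # compiled .pte size in bytes
                        "inference_speed": tokens_per_sec,
                    }
                )
            )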