
Commit cdf332b

Enable inference speed test and 1b test
1 parent 433e30b commit cdf332b

5 files changed, +204 -68 lines

.ci/scripts/test_qnn_static_llama.sh

Lines changed: 2 additions & 2 deletions
@@ -34,11 +34,11 @@ $PYTHON_EXECUTABLE -m extension.llm.tokenizer.tokenizer -t tokenizer.model -o to

 set +e
 # Compile only as weight sharing is not applicable on x86
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --compile_only
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-android/ --executorch_root . --artifact_dir . --llama_artifacts . --compile_only
 exit_code1=$?

 # Checks accuracy with weight sharing disabled since x86 does not support weight sharing.
-$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleScript.test_stories_single_llama --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --enable_x86_64
+$PYTHON_EXECUTABLE backends/qualcomm/tests/test_qnn_delegate.py -k TestExampleLLMScript.test_llama_stories_110m --model SM8650 --build_folder build-x86/ --executorch_root . --artifact_dir . --llama_artifacts . --enable_x86_64
 exit_code2=$?

 # Check the exit codes and print messages

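For readers reproducing this CI step locally, the two runs above amount to: compile on the Android build first, then check accuracy on the x86 build with weight sharing disabled, and fail if either exit code is nonzero. A rough Python sketch of that flow follows; the flags are taken from the script above, but the wrapper itself is hypothetical and not part of the commit.

# Hypothetical sketch of the CI flow above: compile-only first, then an x86 accuracy run.
# Flags mirror .ci/scripts/test_qnn_static_llama.sh; this wrapper is not part of the commit.
import subprocess
import sys

COMMON = [
    sys.executable,
    "backends/qualcomm/tests/test_qnn_delegate.py",
    "-k", "TestExampleLLMScript.test_llama_stories_110m",
    "--model", "SM8650",
    "--executorch_root", ".",
    "--artifact_dir", ".",
    "--llama_artifacts", ".",
]

# Compile only, since weight sharing is not applicable on x86.
exit_code1 = subprocess.run(
    COMMON + ["--build_folder", "build-android/", "--compile_only"]
).returncode

# Check accuracy with weight sharing disabled (x86 does not support it).
exit_code2 = subprocess.run(
    COMMON + ["--build_folder", "build-x86/", "--enable_x86_64"]
).returncode

if exit_code1 or exit_code2:
    print(f"compile_only={exit_code1}, x86_accuracy={exit_code2}")
    sys.exit(1)
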
backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 181 additions & 66 deletions
@@ -2991,6 +2991,173 @@ def test_qnn_backend_draw_graph(self):
         ), "Generated .dot file does not match the golden file."


+class TestExampleLLMScript(TestQNN):
+    def required_envs(self, conditions=None) -> bool:
+        conditions = [] if conditions is None else conditions
+        return all(
+            [
+                self.executorch_root,
+                self.artifact_dir,
+                *conditions,
+            ]
+        )
+
+    def test_llama3_2_1b(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "What is the meaning of life?"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/consolidated.00.pth",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "llama3_2",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "512",
+            "--num_sharding",
+            "4",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "<|begin_of_text|><|start_header_id|>user<|end_header_id|>"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size.
+                # Inference speed on x86 is slow, so we only check when running on Android
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 1300000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 66)  # Lanai
+
+    def test_llama_stories_110m(self):
+        if not self.required_envs():
+            self.skipTest("missing required envs")
+        assert (
+            self.llama_artifacts is not None
+        ), "Please provide path to llama artifacts"
+
+        prompt = "Once"
+        cmds = [
+            "python",
+            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
+            "--artifact",
+            self.artifact_dir,
+            "--build_folder",
+            self.build_folder,
+            "--model",
+            self.model,
+            "--checkpoint",
+            f"{self.llama_artifacts}/stories110M.pt",
+            "--params",
+            f"{self.llama_artifacts}/params.json",
+            "--tokenizer_model",
+            f"{self.llama_artifacts}/tokenizer.model",
+            "--tokenizer_bin",
+            f"{self.llama_artifacts}/tokenizer.bin",
+            "--ip",
+            self.ip,
+            "--port",
+            str(self.port),
+            "--prompt",
+            f"{prompt}",
+            "--ptq",
+            "16a4w",
+            "--temperature",
+            "0",
+            "--llama_model",
+            "stories110m",
+            "--model_mode",
+            "hybrid",
+            "--prefill_seq_len",
+            "32",
+            "--kv_seq_len",
+            "128",
+        ]
+        if self.compile_only:
+            cmds.extend(["--compile_only"])
+        elif self.device:
+            cmds.extend(["--device", self.device])
+            if self.host:
+                cmds.extend(["--host", self.host])
+        elif self.enable_x86_64:
+            cmds.extend(["--enable_x86_64"])
+        if self.pre_gen_pte:
+            cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
+
+        golden_start_with = "Once upon a time,"
+        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
+        with Listener((self.ip, self.port)) as listener:
+            conn = listener.accept()
+            p.communicate()
+            msg = json.loads(conn.recv())
+            if "Error" in msg:
+                self.fail(msg["Error"])
+            else:
+                if not self.compile_only:
+                    model_out = msg["result"][0]
+                    self.assertTrue(
+                        model_out.startswith(golden_start_with),
+                        f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
+                    )
+                # x86 does not allow weight sharing, so we don't check pte size
+                if not self.enable_x86_64:
+                    pte_size = msg["pte_size"]
+                    self.assertLessEqual(pte_size, 130000000)
+                if not self.compile_only and not self.enable_x86_64:
+                    self.assertGreaterEqual(msg["inference_speed"], 220)  # Lanai
+
+
 class TestExampleOssScript(TestQNN):
     def required_envs(self, conditions=None) -> bool:
         conditions = [] if conditions is None else conditions
@@ -3886,72 +4053,6 @@ def test_deeplab_v3(self):
                 self.assertGreaterEqual(msg["MPA"], 0.70)
                 self.assertGreaterEqual(msg["MIoU"], 0.55)

-    def test_stories_single_llama(self):
-        if not self.required_envs():
-            self.skipTest("missing required envs")
-
-        cmds = [
-            "python",
-            f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
-            "--artifact",
-            self.artifact_dir,
-            "--build_folder",
-            self.build_folder,
-            "--model",
-            self.model,
-            "--checkpoint",
-            f"{self.artifact_dir}/stories110M.pt",
-            "--params",
-            f"{self.artifact_dir}/params.json",
-            "--tokenizer_model",
-            f"{self.artifact_dir}/tokenizer.model",
-            "--tokenizer_bin",
-            f"{self.artifact_dir}/tokenizer.bin",
-            "--ip",
-            self.ip,
-            "--port",
-            str(self.port),
-            "--prompt",
-            "Once",
-            "--ptq",
-            "16a4w",
-            "--temperature",
-            "0",
-            "--llama_model",
-            "stories110m",
-            "--model_mode",
-            "hybrid",
-            "--prefill_seq_len",
-            "32",
-            "--kv_seq_len",
-            "128",
-        ]
-        if self.compile_only:
-            cmds.extend(["--compile_only"])
-        elif self.device:
-            cmds.extend(["--device", self.device])
-            if self.host:
-                cmds.extend(["--host", self.host])
-        elif self.enable_x86_64:
-            cmds.extend(["--enable_x86_64"])
-
-        golden_start_with = "Once upon a time,"
-        p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
-        with Listener((self.ip, self.port)) as listener:
-            conn = listener.accept()
-            p.communicate()
-            msg = json.loads(conn.recv())
-            if "Error" in msg:
-                self.fail(msg["Error"])
-            else:
-                if not self.compile_only:
-                    model_out = msg["result"][0]
-                    self.assertTrue(model_out.startswith(golden_start_with))
-                # x86 does not allow weight sharing, so we don't check pte size
-                if not self.enable_x86_64:
-                    pte_size = msg["pte_size"]
-                    self.assertLessEqual(pte_size, 130000000)
-
     @unittest.skip("dynamic shape inputs appear in recent torch.export.export")
     def test_mobilebert(self):
         if not self.required_envs([self.pretrained_weight]):
@@ -4156,6 +4257,18 @@ def setup_environment():
         type=str,
     )

+    parser.add_argument(
+        "--pre_gen_pte",
+        help="Run the pre-generated pte in the given directory.",
+        type=str,
+    )
+
+    parser.add_argument(
+        "--llama_artifacts",
+        help="A folder that contains: weight, tokenizer, and params.",
+        type=str,
+    )
+
     args, ns_args = parser.parse_known_args(namespace=unittest)
     TestQNN.host = args.host
     TestQNN.device = args.device
@@ -4174,6 +4287,8 @@ def setup_environment():
     TestQNN.enable_x86_64 = args.enable_x86_64
     TestQNN.dump_intermediate_outputs = args.dump_intermediate_outputs
     TestQNN.compile_only = args.compile_only
+    TestQNN.pre_gen_pte = args.pre_gen_pte
+    TestQNN.llama_artifacts = args.llama_artifacts

     return sys.argv[:1] + ns_args

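Both new tests talk to llama.py over multiprocessing.connection: the test opens a Listener on --ip/--port, the example script connects back with Client and sends a JSON payload carrying result, pte_size, and inference_speed. A minimal, self-contained sketch of that handshake follows; the host, port, and payload values are placeholders, not values from the commit.

# Minimal sketch of the Listener/Client handshake used by the tests above.
# Host/port and payload numbers are placeholders, not taken from the commit.
import json
from multiprocessing import Process
from multiprocessing.connection import Client, Listener

ADDRESS = ("127.0.0.1", 6000)  # assumed values; the real ones come from --ip/--port

def fake_llama_script():
    # Stand-in for llama.py: report results back to the waiting test.
    with Client(ADDRESS) as conn:
        conn.send(json.dumps({
            "result": ["Once upon a time, ..."],
            "pte_size": 120_000_000,
            "inference_speed": 230.0,
        }))

if __name__ == "__main__":
    p = Process(target=fake_llama_script)
    with Listener(ADDRESS) as listener:
        p.start()
        conn = listener.accept()       # blocks until the script connects
        msg = json.loads(conn.recv())  # same parsing as in the tests
    p.join()
    assert msg["inference_speed"] >= 220  # tokens/sec floor used by test_llama_stories_110m
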
backends/qualcomm/tests/utils.py

Lines changed: 2 additions & 0 deletions
@@ -188,6 +188,8 @@ class TestQNN(unittest.TestCase):
     shared_buffer: bool = False
     enable_x86_64: bool = False
     compile_only: bool = False
+    pre_gen_pte: str = ""
+    llama_artifacts: str = ""

     def _assert_outputs_equal(self, model_output, ref_output):
         self.assertTrue(len(ref_output) == len(model_output))

examples/qualcomm/oss_scripts/llama/llama.py

Lines changed: 5 additions & 0 deletions
@@ -881,13 +881,18 @@ def post_process():

     adb.pull(output_path=args.artifact, callback=post_process)
     if args.ip and args.port != -1:
+        inference_speed = 0
+        with open(f"{args.artifact}/outputs/inference_speed.txt", "r") as f:
+            inference_speed = float(f.read())
+
         pte_size = os.path.getsize(pte_path)
         with Client((args.ip, args.port)) as conn:
            conn.send(
                json.dumps(
                    {
                        "result": outputs,
                        "pte_size": pte_size,
+                        "inference_speed": inference_speed,
                    }
                )
            )

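The host-side change boils down to: read the tokens-per-second figure the device runner wrote to outputs/inference_speed.txt (pulled back by adb), then attach it to the JSON payload alongside the decoded text and the .pte size. A condensed sketch of that step is below; send_payload is a made-up helper name and the paths are placeholders, not the commit's code.

# Condensed sketch of the host-side step added in llama.py.
# send_payload and the paths are illustrative placeholders.
import json
import os
from multiprocessing.connection import Client

def send_payload(artifact_dir, pte_path, outputs, ip, port):
    speed_file = os.path.join(artifact_dir, "outputs", "inference_speed.txt")
    with open(speed_file, "r") as f:            # written by the device runner, pulled via adb
        inference_speed = float(f.read())
    payload = {
        "result": outputs,                       # decoded text, checked against the golden prefix
        "pte_size": os.path.getsize(pte_path),   # checked against the size budget
        "inference_speed": inference_speed,      # checked against the tokens/sec floor
    }
    with Client((ip, port)) as conn:
        conn.send(json.dumps(payload))
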
examples/qualcomm/oss_scripts/llama/runner/runner.cpp

Lines changed: 14 additions & 0 deletions
@@ -18,6 +18,7 @@
 #include <executorch/runtime/core/exec_aten/util/scalar_type_util.h>
 #include <executorch/runtime/platform/log.h>
 #include <ctime>
+#include <fstream>
 #include <sstream>

 using executorch::aten::Tensor;
@@ -518,6 +519,19 @@ void printReport(const Runner::Stats& stats) {
       stats.num_generated_tokens,
       (double)stats.aggregate_sampling_time_ms /
           stats.SCALING_FACTOR_UNITS_PER_SECOND);
+
+  // For now, we just print the total inference time for CI, can save more info
+  // in future if needed.
+  std::ofstream outfile("outputs/inference_speed.txt");
+  if (outfile.is_open()) {
+    double num_tok = (stats.num_generated_tokens) /
+        (double)(stats.inference_end_ms - stats.inference_start_ms) *
+        stats.SCALING_FACTOR_UNITS_PER_SECOND;
+    outfile << num_tok;
+    outfile.close();
+  } else {
+    ET_CHECK_MSG(false, "Error saving the inference speed file");
+  }
 }

 std::string statsToJsonString(const Runner::Stats& stats) {

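The figure written to outputs/inference_speed.txt is tokens per second: generated tokens divided by the decode wall time, rescaled by SCALING_FACTOR_UNITS_PER_SECOND (assumed here to be 1000, i.e. millisecond timestamps). The same arithmetic in Python, with illustrative numbers:

# Tokens-per-second arithmetic mirroring the printReport() addition in runner.cpp.
# The numbers are illustrative; 1000 assumes millisecond timestamps.
SCALING_FACTOR_UNITS_PER_SECOND = 1000

def tokens_per_second(num_generated_tokens, inference_start_ms, inference_end_ms):
    elapsed_ms = inference_end_ms - inference_start_ms
    return num_generated_tokens / elapsed_ms * SCALING_FACTOR_UNITS_PER_SECOND

# e.g. 128 tokens generated in 550 ms -> ~232.7 tok/s, above the 220 tok/s
# floor that test_llama_stories_110m asserts on a Lanai device.
print(tokens_per_second(128, 0, 550))
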