
Commit 208989b

Update on "[llm] Add a generic text only LLM runner"
Introducing `text_llm_runner`. This can be used to run all text-only, decoder-only LLM models supported by ExecuTorch.

* Metadata is read from the .pte file and used to construct the runner object.
* examples/models/llama/runner.h[.cpp] now contains only a simple wrapper around `text_llm_runner.h[.cpp]`.

In subsequent PRs I will move examples/models/phi-3-mini/runner to use the generic runner, and will look into the QNN and MediaTek runners as well.

Differential Revision: [D75910889](https://our.internmc.facebook.com/intern/diff/D75910889/)

[ghstack-poisoned]
2 parents fca9d38 + d0e7faa commit 208989b
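For context on the metadata-driven construction described above, here is a conceptual Python sketch. It is illustration only: the actual runner is C++, and the metadata key names below are hypothetical, not the real .pte keys.

from dataclasses import dataclass


# Conceptual sketch only: the real runner is C++, and the metadata key
# names below are hypothetical illustrations, not the actual .pte keys.
@dataclass
class RunnerConfig:
    vocab_size: int
    max_seq_len: int
    bos_id: int
    eos_id: int


def runner_config_from_metadata(metadata: dict) -> RunnerConfig:
    # Per-model constants come from the exported program itself, so one
    # generic runner can serve any text-only, decoder-only model.
    return RunnerConfig(
        vocab_size=metadata["vocab_size"],
        max_seq_len=metadata["max_seq_len"],
        bos_id=metadata["bos_id"],
        eos_id=metadata["eos_id"],
    )


print(runner_config_from_metadata(
    {"vocab_size": 32000, "max_seq_len": 2048, "bos_id": 1, "eos_id": 2}
))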

46 files changed: +937 -281 lines changed
Lines changed: 1 addition & 1 deletion

@@ -1 +1 @@
-01f1cc44cbbfdf6307aa01b803a4ee22f9ade946
+5616fa4a68718ead203314a3467f7dd9547153ae

backends/arm/quantizer/quantization_annotator.py

Lines changed: 35 additions & 10 deletions
@@ -95,7 +95,10 @@ def _is_ok_for_quantization(
             continue
 
         for n_arg in _as_list(node.args[quant_property.index]):
-            assert isinstance(n_arg, Node)
+            if not isinstance(n_arg, Node):
+                raise TypeError(
+                    f"n_arg must be a Node instance, got {type(n_arg).__name__!r}"
+                )
             if not is_ok_for_quantization(n_arg, gm):  # type: ignore[attr-defined]
                 logger.debug(
                     f'could not quantize node due to input "{node}": '
@@ -108,7 +111,10 @@ def _is_ok_for_quantization(
 
 
 def _annotate_input(node: Node, quant_property: _QuantProperty):
-    assert not is_annotated(node)
+    if is_annotated(node):
+        raise RuntimeError(
+            f"Cannot annotate input: node '{node.name}' is already annotated"
+        )
     if quant_property.optional and (
         quant_property.index >= len(node.args)
         or node.args[quant_property.index] is None
@@ -120,17 +126,28 @@ def _annotate_input(node: Node, quant_property: _QuantProperty):
         _as_list(quant_property.qspec),
         strict=True,
     ):
-        assert isinstance(n_arg, Node)
+        if not isinstance(n_arg, Node):
+            raise TypeError(
+                f"n_arg must be a Node instance, got {type(n_arg).__name__!r}"
+            )
         annotate_input_qspec_map(node, n_arg, qspec)
         if quant_property.mark_annotated:
             mark_node_as_annotated(n_arg)  # type: ignore[attr-defined]
 
 
 def _annotate_output(node: Node, quant_property: _QuantProperty):
-    assert not is_annotated(node)
-    assert not quant_property.mark_annotated
-    assert not quant_property.optional
-    assert quant_property.index == 0, "Only one output annotation supported currently"
+    if is_annotated(node):
+        raise RuntimeError(
+            f"Cannot annotate output: node '{node.name}' is already annotated"
+        )
+    if quant_property.mark_annotated:
+        raise ValueError(
+            "quant_property.mark_annotated must be False for output annotation"
+        )
+    if quant_property.optional:
+        raise ValueError("quant_property.optional must be False for output annotation")
+    if quant_property.index != 0:
+        raise ValueError("Only one output annotation supported currently")
 
     annotate_output_qspec(node, quant_property.qspec)
 
@@ -145,7 +162,9 @@ def _match_pattern(
 
     Each 'pattern' element is composed of a list of disjunctive nodes types.
     """
-    assert len(pattern) > 0, "No pattern provided"
+    if len(pattern) < 1:
+        raise ValueError("No pattern provided")
+
     if filter_fn is not None:
         if not filter_fn(node):
             return False
@@ -417,8 +436,14 @@ def any_or_hardtanh_min_zero(n: Node):
         torch.ops.aten.concatenate.default,
         torch.ops.aten.stack.default,
     ):
-        assert isinstance(node.args[0], list)
-        assert len(node.args[0]) != 0
+        # first argument should be a non-empty list of nodes
+        if not isinstance(node.args[0], list):
+            raise TypeError(
+                "Expected node.args[0] to be a list, got "
+                f"{type(node.args[0]).__name__!r}"
+            )
+        if len(node.args[0]) == 0:
+            raise ValueError("Expected non-empty list for node.args[0]")
 
         shared_qspec = SharedQuantizationSpec((node.args[0][0], node))
         quant_properties.quant_inputs = [
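A note on the pattern in this file: unlike `assert`, the explicit exceptions survive `python -O` (which strips assert statements) and carry messages that name the offending value. A minimal standalone sketch of the same check (the `Node` stub below is hypothetical, standing in for `torch.fx.Node`):

# Minimal sketch of the assert-to-exception pattern; the Node stub here
# is hypothetical, standing in for torch.fx.Node.
class Node:
    def __init__(self, name: str):
        self.name = name


def require_node(n_arg: object) -> Node:
    # Unlike `assert isinstance(n_arg, Node)`, this check still runs
    # under `python -O` and reports the offending type by name.
    if not isinstance(n_arg, Node):
        raise TypeError(
            f"n_arg must be a Node instance, got {type(n_arg).__name__!r}"
        )
    return n_arg


require_node(Node("conv1"))  # passes

try:
    require_node("conv1")  # a plain str is rejected
except TypeError as e:
    print(e)  # n_arg must be a Node instance, got 'str'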

backends/arm/scripts/build_executorch.sh

Lines changed: 1 addition & 40 deletions
@@ -54,47 +54,9 @@ source ${setup_path_script}
 
 et_build_dir="${et_build_root}/cmake-out"
 
-# Used for flatcc host excutable if Devtools is used
-et_build_host_dir=${et_build_root}/cmake-out-host-tools
-
 set -x
 cd "${et_root_dir}"
 
-if [ "$build_with_etdump" = true ] ; then
-    ( set +x ;
-      echo "--------------------------------------------------------------------------------" ;
-      echo "Build ExecuTorch Libraries host flatcc bin ${build_type} into ${et_build_host_dir}/bin/flatcc" ;
-      echo "--------------------------------------------------------------------------------" )
-
-    # Build host flatcc bin
-    # This is a way to work around that the flatcc executable get build for target (e.g. Arm) later
-    # and get replaced. flatcc is a tool used on the host for etdump and BundleIO handling.
-    # The way to solve this is to generate it once for the host, then copy it to ${et_build_host_dir}/bin
-    # and later point that out with -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc later.
-
-    cmake \
-        -DCMAKE_INSTALL_PREFIX=${et_build_host_dir} \
-        -DCMAKE_BUILD_TYPE=${build_type} \
-        -DEXECUTORCH_BUILD_EXECUTOR_RUNNER=OFF \
-        -DEXECUTORCH_ENABLE_LOGGING=ON \
-        -DEXECUTORCH_BUILD_ARM_BAREMETAL=ON \
-        -DEXECUTORCH_BUILD_KERNELS_QUANTIZED=ON \
-        -DEXECUTORCH_BUILD_EXTENSION_RUNNER_UTIL=ON \
-        -DEXECUTORCH_BUILD_DEVTOOLS=ON \
-        -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
-        -DFLATCC_ALLOW_WERROR=OFF \
-        -B"${et_build_host_dir}" \
-        "${et_root_dir}"
-
-    # third-party/flatcc/bin/flatcc gets build already in the in the cmake config step above
-    # so there is no cmake building step done
-
-    # Copy host flatcc excutable so it's saved when we build for target (Arm) later
-    et_build_host_dir=$(realpath ${et_build_host_dir})
-    mkdir -p ${et_build_host_dir}/bin
-    cp third-party/flatcc/bin/flatcc ${et_build_host_dir}/bin
-fi
-
 ( set +x ;
   echo "--------------------------------------------------------------------------------" ;
   echo "Build ExecuTorch target libs ${build_type} into '${et_build_dir}'" ;
@@ -111,8 +73,7 @@ if [ "$build_with_etdump" = true ] ; then
    build_with_etdump_flags="-DEXECUTORCH_BUILD_DEVTOOLS=ON \
                             -DEXECUTORCH_ENABLE_EVENT_TRACER=ON \
                             -DEXECUTORCH_BUILD_EXTENSION_DATA_LOADER=OFF \
-                            -DFLATCC_ALLOW_WERROR=OFF \
-                            -DFLATCC_EXECUTABLE=${et_build_host_dir}/bin/flatcc "
+                            -DFLATCC_ALLOW_WERROR=OFF "
 fi
 
 echo "Building with Devtools: ${build_devtools_flags} ${build_with_etdump_flags}"

backends/mediatek/partitioner.py

Lines changed: 1 addition & 0 deletions
@@ -81,6 +81,7 @@ def ops_to_not_decompose(
         torch.ops.aten.upsample_bilinear2d.vec,
         torch.ops.aten.upsample_nearest2d.default,
         torch.ops.aten.upsample_nearest2d.vec,
+        torch.ops.aten._safe_softmax.default,
     ]
     return (ops_not_decompose, None)

backends/mediatek/scripts/mtk_build.sh

Lines changed: 1 addition & 0 deletions
@@ -33,6 +33,7 @@ rm -rf cmake-android-out && mkdir cmake-android-out && cd cmake-android-out
 cmake -DBUCK2="$BUCK_PATH" \
       -DCMAKE_TOOLCHAIN_FILE="$ANDROID_NDK/build/cmake/android.toolchain.cmake" \
       -DANDROID_ABI=arm64-v8a \
+      -DANDROID_PLATFORM=android-26 \
       -DEXECUTORCH_BUILD_NEURON=ON \
       -DNEURON_BUFFER_ALLOCATOR_LIB="$NEURON_BUFFER_ALLOCATOR_LIB" \
       ..

examples/demo-apps/apple_ios/LLaMA/LLaMARunner/LLaMARunner/Exported/LLaMARunner.mm

Lines changed: 2 additions & 1 deletion
@@ -14,6 +14,7 @@
 
 using executorch::extension::llm::GenerationConfig;
 using executorch::extension::llm::Image;
+using executorch::extension::llm::TextLLMRunner;
 using executorch::runtime::Error;
 
 NSErrorDomain const LLaMARunnerErrorDomain = @"LLaMARunnerErrorDomain";
@@ -23,7 +24,7 @@ @interface LLaMARunner ()<ExecuTorchLogSink>
 @end
 
 @implementation LLaMARunner {
-  std::unique_ptr<example::Runner> _runner;
+  std::unique_ptr<TextLLMRunner> _runner;
 }
 
 - (instancetype)initWithModelPath:(NSString*)modelPath

examples/mediatek/aot_utils/oss_utils/utils.py

Lines changed: 5 additions & 5 deletions
@@ -24,6 +24,8 @@ def build_executorch_binary(
     file_name,
     dataset,
     quant_dtype: Optional[Precision] = None,
+    skip_op_name: Optional[set] = None,
+    skip_op_type: Optional[set] = None,
 ):
     if quant_dtype is not None:
         quantizer = NeuropilotQuantizer()
@@ -47,14 +49,12 @@ def build_executorch_binary(
     from executorch.exir.program._program import to_edge_transform_and_lower
 
     edge_compile_config = exir.EdgeCompileConfig(_check_ir_validity=False)
-    # skipped op names are used for deeplabV3 model
     neuro_partitioner = NeuropilotPartitioner(
         [CompileSpec("platform-config", b"mt6989")],
-        op_names_to_skip={
-            "aten_convolution_default_106",
-            "aten_convolution_default_107",
-        },
+        op_types_to_skip=skip_op_type,
+        op_names_to_skip=skip_op_name,
     )
+
     edge_prog = to_edge_transform_and_lower(
         aten_dialect,
         compile_config=edge_compile_config,
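With this change the hard-coded deeplabV3 skips move to the call site: callers now pass per-model `skip_op_name` / `skip_op_type` sets. A hedged sketch of the new calling convention (the stub below merely mirrors the signature; the real helper is `build_executorch_binary` above):

from typing import Optional


# Hypothetical stub mirroring the new signature, just to show how the
# skip sets now travel with each caller instead of living in the helper.
def build_executorch_binary_stub(
    model,
    inputs,
    file_name,
    dataset,
    quant_dtype=None,
    skip_op_name: Optional[set] = None,
    skip_op_type: Optional[set] = None,
):
    print("op names to skip:", sorted(skip_op_name or set()))
    print("op types to skip:", sorted(skip_op_type or set()))


# The deeplabV3-specific names that used to be hard-coded in the helper:
build_executorch_binary_stub(
    model=None,
    inputs=(),
    file_name="deeplab_v3_mtk",
    dataset=[],
    skip_op_name={"aten_convolution_default_106", "aten_convolution_default_107"},
)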
Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
+# Copyright (c) MediaTek Inc.
+# All rights reserved
+#
+# This source code is licensed under the BSD-style license found in the
+# LICENSE file in the root directory of this source tree.
+import os
+import sys
+
+if os.getcwd() not in sys.path:
+    sys.path.append(os.getcwd())
+import argparse
+import os
+
+import dcgan_main
+
+import torch
+from aot_utils.oss_utils.utils import build_executorch_binary
+from executorch.backends.mediatek import Precision
+
+
+class NhwcWrappedModel(torch.nn.Module):
+    def __init__(self, is_gen=True):
+        super(NhwcWrappedModel, self).__init__()
+        if is_gen:
+            self.dcgan = dcgan_main.Generator()
+        else:
+            self.dcgan = dcgan_main.Discriminator()
+
+    def forward(self, input1):
+        nchw_input1 = input1.permute(0, 3, 1, 2)
+        output = self.dcgan(nchw_input1)
+        return output
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument(
+        "-a",
+        "--artifact",
+        help="path for storing generated artifacts by this example. " "Default ./dcgan",
+        default="./dcgan",
+        type=str,
+    )
+
+    args = parser.parse_args()
+
+    # ensure the working directory exist.
+    os.makedirs(args.artifact, exist_ok=True)
+
+    # prepare dummy data
+    inputG = torch.randn(1, 1, 1, 100)
+    inputD = torch.randn(1, 64, 64, 3)
+
+    # build Generator
+    netG_instance = NhwcWrappedModel(True)
+    netG_pte_filename = "dcgan_netG_mtk"
+    build_executorch_binary(
+        netG_instance.eval(),
+        (torch.randn(1, 1, 1, 100),),
+        f"{args.artifact}/{netG_pte_filename}",
+        [(inputG,)],
+        quant_dtype=Precision.A8W8,
+    )
+
+    # build Discriminator
+    netD_instance = NhwcWrappedModel(False)
+    netD_pte_filename = "dcgan_netD_mtk"
+    build_executorch_binary(
+        netD_instance.eval(),
+        (torch.randn(1, 64, 64, 3),),
+        f"{args.artifact}/{netD_pte_filename}",
+        [(inputD,)],
+        quant_dtype=Precision.A8W8,
+    )
+
+    # save data to inference on device
+    input_list_file = f"{args.artifact}/input_list_G.txt"
+    with open(input_list_file, "w") as f:
+        f.write("inputG_0_0.bin")
+        f.flush()
+    file_name = f"{args.artifact}/inputG_0_0.bin"
+    inputG.detach().numpy().tofile(file_name)
+    file_name = f"{args.artifact}/goldenG_0_0.bin"
+    goldenG = netG_instance(inputG)
+    goldenG.detach().numpy().tofile(file_name)
+
+    input_list_file = f"{args.artifact}/input_list_D.txt"
+    with open(input_list_file, "w") as f:
+        f.write("inputD_0_0.bin")
+        f.flush()
+    file_name = f"{args.artifact}/inputD_0_0.bin"
+    inputD.detach().numpy().tofile(file_name)
+    file_name = f"{args.artifact}/goldenD_0_0.bin"
+    goldenD = netD_instance(inputD)
+    goldenD.detach().numpy().tofile(file_name)
Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+"""Ref https://github.com/pytorch/examples/blob/main/dcgan/main.py"""
+
+import torch.nn as nn
+
+
+class Generator(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.main = nn.Sequential(
+            # input is Z, going into a convolution
+            nn.ConvTranspose2d(100, 64 * 8, 4, 1, 0, bias=False),
+            nn.BatchNorm2d(64 * 8),
+            nn.ReLU(True),
+            # state size. (64*8) x 4 x 4
+            nn.ConvTranspose2d(64 * 8, 64 * 4, 4, 2, 1, bias=False),
+            nn.BatchNorm2d(64 * 4),
+            nn.ReLU(True),
+            # state size. (64*4) x 8 x 8
+            nn.ConvTranspose2d(64 * 4, 64 * 2, 4, 2, 1, bias=False),
+            nn.BatchNorm2d(64 * 2),
+            nn.ReLU(True),
+            # state size. (64*2) x 16 x 16
+            nn.ConvTranspose2d(64 * 2, 64, 4, 2, 1, bias=False),
+            nn.BatchNorm2d(64),
+            nn.ReLU(True),
+            # state size. (64) x 32 x 32
+            nn.ConvTranspose2d(64, 3, 4, 2, 1, bias=False),
+            nn.Tanh(),
+            # state size. (3) x 64 x 64
+        )
+
+    def forward(self, input):
+        output = self.main(input)
+        return output
+
+
+# main_netG_input_shape = [1, 100, 1, 1]
+# model = Generator()
+
+
+class Discriminator(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.main = nn.Sequential(
+            # input is (3) x 64 x 64
+            nn.Conv2d(3, 64, 4, 2, 1, bias=False),
+            nn.LeakyReLU(0.2, inplace=True),
+            # state size. (64) x 32 x 32
+            nn.Conv2d(64, 64 * 2, 4, 2, 1, bias=False),
+            nn.BatchNorm2d(64 * 2),
+            nn.LeakyReLU(0.2, inplace=True),
+            # state size. (64*2) x 16 x 16
+            nn.Conv2d(64 * 2, 64 * 4, 4, 2, 1, bias=False),
+            nn.BatchNorm2d(64 * 4),
+            nn.LeakyReLU(0.2, inplace=True),
+            # state size. (64*4) x 8 x 8
+            nn.Conv2d(64 * 4, 64 * 8, 4, 2, 1, bias=False),
+            nn.BatchNorm2d(64 * 8),
+            nn.LeakyReLU(0.2, inplace=True),
+            # state size. (64*8) x 4 x 4
+            nn.Conv2d(64 * 8, 1, 4, 1, 0, bias=False),
+            nn.Sigmoid(),
+        )
+
+    def forward(self, input):
+        output = self.main(input)
+
+        return output.view(-1, 1).squeeze(1)
+
+
+# main_netD_input_shape = [1, 3, 64, 64]
+# model = Discriminator()
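As a quick sanity check, the commented "state size" annotations can be verified with dummy forward passes. A sketch assuming this file is importable as `dcgan_main`, as the export script above assumes:

import torch

from dcgan_main import Discriminator, Generator

# Verify the commented shapes with dummy NCHW inputs.
netG = Generator().eval()
z = torch.randn(1, 100, 1, 1)  # main_netG_input_shape
print(netG(z).shape)  # torch.Size([1, 3, 64, 64])

netD = Discriminator().eval()
img = torch.randn(1, 3, 64, 64)  # main_netD_input_shape
print(netD(img).shape)  # torch.Size([1])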
