Skip to content

Commit 8a69060

Browse files
committed
Update base for Update on "[llm] Add a generic text only LLM runner"
Introducing `text_llm_runner`. This can be used to run all text-only, decoder-only LLM models supported by ExecuTorch. * Metadata is read out from the .pte file and used to construct the runner object. * examples/models/llama/runner.h[.cpp] only contains a simple wrapper around `text_llm_runner.h[.cpp]`. In the next PRs I will move examples/models/phi-3-mini/runner to use the generic runner. Will look into QNN and MediaTek runners as well. Differential Revision: [D75910889](https://our.internmc.facebook.com/intern/diff/D75910889/) [ghstack-poisoned]
2 parents a05c2e9 + b2c02fe commit 8a69060

File tree

121 files changed

+4793
-681
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

121 files changed

+4793
-681
lines changed

.ci/scripts/analyze_benchmark_stability.py

Lines changed: 1523 additions & 0 deletions
Large diffs are not rendered by default.

.github/workflows/pull.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -371,7 +371,7 @@ jobs:
371371
size=${arr[4]}
372372
# threshold=48120 on devserver with gcc11.4
373373
# todo(lfq): update once binary size is below 50kb.
374-
threshold="51408"
374+
threshold="55584"
375375
if [[ "$size" -le "$threshold" ]]; then
376376
echo "Success $size <= $threshold"
377377
else
@@ -406,7 +406,7 @@ jobs:
406406
output=$(ls -la cmake-out/test/size_test)
407407
arr=($output)
408408
size=${arr[4]}
409-
threshold="47560"
409+
threshold="51728"
410410
if [[ "$size" -le "$threshold" ]]; then
411411
echo "Success $size <= $threshold"
412412
else

backends/arm/test/tester/analyze_output_utils.py

Lines changed: 16 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,13 @@ def print_error_diffs(
154154
output_str += f"BATCH {n}\n"
155155
result_batch = result[n, :, :, :]
156156
reference_batch = reference[n, :, :, :]
157+
158+
if reference_batch.dtype == torch.bool or result_batch.dtype == torch.bool:
159+
mismatches = (reference_batch != result_batch).sum().item()
160+
total = reference_batch.numel()
161+
output_str += f"(BOOLEAN tensor) {mismatches} / {total} elements differ ({mismatches / total:.2%})\n"
162+
continue
163+
157164
is_close = torch.allclose(result_batch, reference_batch, rtol, atol)
158165
if is_close:
159166
output_str += ".\n"
@@ -180,14 +187,15 @@ def print_error_diffs(
180187
output_str += _print_elements(
181188
result[n, :, :, :], reference[n, :, :, :], C, H, W, rtol, atol
182189
)
183-
184-
reference_range = torch.max(reference) - torch.min(reference)
185-
diff = torch.abs(reference - result).flatten()
186-
diff = diff[diff.nonzero()]
187-
if not len(diff) == 0:
188-
diff_percent = diff / reference_range
189-
output_str += "\nMEAN MEDIAN MAX MIN (error as % of reference output range)\n"
190-
output_str += f"{torch.mean(diff_percent):<8.2%} {torch.median(diff_percent):<8.2%} {torch.max(diff_percent):<8.2%} {torch.min(diff_percent):<8.2%}\n"
190+
# Only compute numeric error metrics if tensor is not boolean
191+
if reference.dtype != torch.bool and result.dtype != torch.bool:
192+
reference_range = torch.max(reference) - torch.min(reference)
193+
diff = torch.abs(reference - result).flatten()
194+
diff = diff[diff.nonzero()]
195+
if not len(diff) == 0:
196+
diff_percent = diff / reference_range
197+
output_str += "\nMEAN MEDIAN MAX MIN (error as % of reference output range)\n"
198+
output_str += f"{torch.mean(diff_percent):<8.2%} {torch.median(diff_percent):<8.2%} {torch.max(diff_percent):<8.2%} {torch.min(diff_percent):<8.2%}\n"
191199

192200
# Over-engineer separators to match output width
193201
lines = output_str.split("\n")

backends/cadence/aot/TARGETS

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -276,6 +276,24 @@ python_library(
276276
],
277277
)
278278

279+
python_library(
280+
name = "decompose_ops",
281+
srcs = [
282+
"decompose_ops.py",
283+
],
284+
typing = True,
285+
deps = [
286+
":pass_utils",
287+
"//caffe2:torch",
288+
"//executorch/backends/cadence/aot:pass_utils",
289+
"//executorch/exir:pass_base",
290+
"//executorch/exir/dialects:lib",
291+
"//executorch/exir/dialects/edge:lib",
292+
"//executorch/exir/passes:spec_prop_pass",
293+
],
294+
)
295+
296+
279297
python_unittest(
280298
name = "test_graph_builder",
281299
srcs = [
@@ -314,6 +332,27 @@ python_unittest(
314332
],
315333
)
316334

335+
python_unittest(
336+
name = "test_decompose_ops_passes",
337+
srcs = [
338+
"tests/test_decompose_ops_passes.py",
339+
],
340+
supports_static_listing = False,
341+
typing = True,
342+
deps = [
343+
"fbsource//third-party/pypi/parameterized:parameterized",
344+
":compiler",
345+
":decompose_ops",
346+
"//caffe2:torch",
347+
"//executorch/backends/cadence/aot:compiler",
348+
"//executorch/backends/cadence/aot:graph_builder",
349+
"//executorch/backends/cadence/aot:pass_utils",
350+
"//executorch/exir:pass_base",
351+
"//executorch/exir/dialects:lib",
352+
"//executorch/exir/passes:lib",
353+
],
354+
)
355+
317356
python_unittest(
318357
name = "test_fusion_ops_passes",
319358
srcs = [
Lines changed: 122 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,122 @@
1+
# Copyright (c) Meta Platforms, Inc. and affiliates.
2+
# All rights reserved.
3+
# Copyright 2025 Arm Limited and/or its affiliates.
4+
#
5+
# This source code is licensed under the BSD-style license found in the
6+
# LICENSE file in the root directory of this source tree.
7+
8+
9+
# This file contains all the functions that decompose one op into simpler ops in the
10+
# graph. The functions decomposing ops for models deployed with Jarvis are grouped
11+
# together in class 'DecomposeOpsInGraph'. Some examples of functions in the class are
12+
# 1. functions that decompose an ATen gelu op into an equivalent series of simpler ops
13+
14+
# pyre-strict
15+
16+
from typing import Dict
17+
18+
from executorch.backends.cadence.aot.pass_utils import (
19+
CadencePassAttribute,
20+
register_cadence_pass,
21+
)
22+
from executorch.exir.dialects._ops import ops as exir_ops
23+
from executorch.exir.dialects.edge._ops import EdgeOpOverload
24+
from executorch.exir.pass_base import ExportPass, NodeMetadata, ProxyValue
25+
from torch.fx.node import Argument
26+
27+
28+
@register_cadence_pass(CadencePassAttribute(opt_level=0))
29+
class DecomposeAtenApproxGeluPass(ExportPass):
30+
"""
31+
Decompose the aten gelu op with an approximate arg to a series of simpler ops
32+
"""
33+
34+
def call_operator(
35+
self,
36+
op: EdgeOpOverload,
37+
args: tuple[Argument, ...],
38+
kwargs: Dict[str, Argument],
39+
meta: NodeMetadata,
40+
) -> ProxyValue:
41+
# compute the approximate gelu (0.7978845608028654 is sqrt(2 / pi))
42+
# as 0.5 * x * (1 + torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3)))
43+
44+
# Get 0.5 * x
45+
half = super().call_operator(
46+
exir_ops.edge.aten.mul.Tensor,
47+
(args[0], 0.5),
48+
{},
49+
meta,
50+
)
51+
52+
scaled = super().call_operator(
53+
exir_ops.edge.aten.mul.Tensor,
54+
(args[0], 0.044715),
55+
{},
56+
meta,
57+
)
58+
59+
# Get x^2 (note that we use mul.Tensor twice instead of pow.Tensor because
60+
# it is much more efficient on DSP backends)
61+
scaled_square = super().call_operator(
62+
exir_ops.edge.aten.mul.Tensor,
63+
(scaled, args[0]),
64+
{},
65+
meta,
66+
)
67+
68+
# Get x^3
69+
scaled_cubed = super().call_operator(
70+
exir_ops.edge.aten.mul.Tensor,
71+
(scaled_square, args[0]),
72+
{},
73+
meta,
74+
)
75+
76+
# Get x + 0.044715 * x^3
77+
inner_sum = super().call_operator(
78+
exir_ops.edge.aten.add.Tensor,
79+
(scaled_cubed, args[0]),
80+
{},
81+
meta,
82+
)
83+
84+
# Get 0.7978845608028654 * ( x + 0.044715 * x^3)
85+
scaled_sum = super().call_operator(
86+
exir_ops.edge.aten.mul.Tensor,
87+
(inner_sum, 0.7978845608028654),
88+
{},
89+
meta,
90+
)
91+
92+
# Get torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3))
93+
tanh = super().call_operator(
94+
exir_ops.edge.aten.tanh.default,
95+
(scaled_sum,),
96+
{},
97+
meta,
98+
)
99+
100+
# Get 1 + torch.tanh(0.79788456 * ( x + 0.044715 * x^3))
101+
# TODO(): Check why this is not working properly with integer values (e.g. 1 instead of 1.)
102+
outer_sum = super().call_operator(
103+
exir_ops.edge.aten.add.Tensor,
104+
(tanh, 1.0),
105+
{},
106+
meta,
107+
)
108+
109+
# Return the final result
110+
return super().call_operator(
111+
exir_ops.edge.aten.mul.Tensor,
112+
(half, outer_sum),
113+
{},
114+
meta,
115+
)
116+
117+
118+
# This class encapsulates all the functions that decompose one op in the graph.
119+
class CadenceDecomposeOpsInGraph:
120+
passes = [
121+
DecomposeAtenApproxGeluPass,
122+
]

backends/cadence/aot/replace_ops.py

Lines changed: 1 addition & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -2078,89 +2078,11 @@ def call_operator(
20782078
kwargs: Dict[str, Argument],
20792079
meta: NodeMetadata,
20802080
) -> ProxyValue:
2081-
if "approximate" not in kwargs:
2082-
return super().call_operator(op, args, kwargs, meta)
2083-
20842081
if op not in {
20852082
exir_ops.edge.aten.gelu.default,
20862083
}:
20872084
return super().call_operator(op, args, kwargs, meta)
2088-
2089-
# compute the approximate gelu (0.7978845608028654 is sqrt(2 / pi))
2090-
# as 0.5 * x * (1 + torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3)))
2091-
2092-
# Get 0.5 * x
2093-
half = super().call_operator(
2094-
exir_ops.edge.aten.mul.Tensor,
2095-
(args[0], 0.5),
2096-
{},
2097-
meta,
2098-
)
2099-
2100-
scaled = super().call_operator(
2101-
exir_ops.edge.aten.mul.Tensor,
2102-
(args[0], 0.044715),
2103-
{},
2104-
meta,
2105-
)
2106-
2107-
# Get x^2 (note that we use mul.Tensor twice instead of pow.Tensor because
2108-
# it is much more efficient on DSP backends)
2109-
scaled_square = super().call_operator(
2110-
exir_ops.edge.aten.mul.Tensor,
2111-
(scaled, args[0]),
2112-
{},
2113-
meta,
2114-
)
2115-
2116-
# Get x^3
2117-
scaled_cubed = super().call_operator(
2118-
exir_ops.edge.aten.mul.Tensor,
2119-
(scaled_square, args[0]),
2120-
{},
2121-
meta,
2122-
)
2123-
2124-
# Get x + 0.044715 * x^3
2125-
inner_sum = super().call_operator(
2126-
exir_ops.edge.aten.add.Tensor,
2127-
(scaled_cubed, args[0]),
2128-
{},
2129-
meta,
2130-
)
2131-
2132-
# Get 0.7978845608028654 * ( x + 0.044715 * x^3)
2133-
scaled_sum = super().call_operator(
2134-
exir_ops.edge.aten.mul.Tensor,
2135-
(inner_sum, 0.7978845608028654),
2136-
{},
2137-
meta,
2138-
)
2139-
2140-
# Get torch.tanh(0.7978845608028654 * ( x + 0.044715 * x^3))
2141-
tanh = super().call_operator(
2142-
exir_ops.edge.aten.tanh.default,
2143-
(scaled_sum,),
2144-
{},
2145-
meta,
2146-
)
2147-
2148-
# Get 1 + torch.tanh(0.79788456 * ( x + 0.044715 * x^3))
2149-
# TODO(): Check why this is not working properly with integer values (e.g. 1 instead of 1.)
2150-
outer_sum = super().call_operator(
2151-
exir_ops.edge.aten.add.Tensor,
2152-
(tanh, 1.0),
2153-
{},
2154-
meta,
2155-
)
2156-
2157-
# Return the final result
2158-
return super().call_operator(
2159-
exir_ops.edge.aten.mul.Tensor,
2160-
(half, outer_sum),
2161-
{},
2162-
meta,
2163-
)
2085+
return super().call_operator(op, args, kwargs, meta)
21642086

21652087

21662088
# Adapted from fbcode/pyspeech/opt_passes/replace_ops.py

0 commit comments

Comments
 (0)