Commit bc99fb3

Merge branch 'main' into export-D82581442

2 parents 726e809 + b1a41e7

File tree: 8 files changed, +228 −10 lines


.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ run_and_verify() {
 
   # verify result.txt
   RESULT=$(cat result.txt)
-  EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with"
+  EXPECTED_PREFIX="ASSISTANT: The image captures a basketball game in progress, with"
 
   if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
     echo "Expected result prefix: ${EXPECTED_PREFIX}"

backends/cadence/aot/ops_registrations.py

Lines changed: 39 additions & 0 deletions
@@ -324,6 +324,19 @@
     "rope.out(Tensor input, Tensor sin_tensor, Tensor cos_tensor, Tensor? pos, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
+lib.define(
+    "quantized_softmax(Tensor input, Tensor mask, int dim, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point) -> (Tensor out)"
+)
+lib.define(
+    "quantized_softmax.per_tensor(Tensor input, Tensor mask, int dim, float in_scale, int in_zero_point, float out_scale, int out_zero_point) -> (Tensor out)"
+)
+lib.define(
+    "quantized_softmax.out(Tensor input, Tensor mask, int dim, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
+lib.define(
+    "quantized_softmax.per_tensor_out(Tensor input, Tensor mask, int dim, float in_scale, int in_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
+
 # Load/store with iDMA. These only exist before memory planning.
 # Post memory planning, we check that outputs/inputs for the load/store are in
 # DTCM and replace idma_load/idma_store with idma_copy.
@@ -2329,3 +2342,29 @@ def softmax_f32_f32_meta(
     half_to_float: Optional[bool] = None,
 ) -> torch.Tensor:
     return self.new_empty(self.size(), dtype=self.dtype)
+
+
+@register_fake("cadence::quantized_softmax")
+def quantized_softmax_meta(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    dim: int,
+    in_scale: torch.Tensor,
+    in_zero_point: torch.Tensor,
+    out_scale: torch.Tensor,
+    out_zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
+@register_fake("cadence::quantized_softmax.per_tensor")
+def quantized_softmax_per_tensor_meta(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    dim: int,
+    in_scale: float,
+    in_zero_point: int,
+    out_scale: float,
+    out_zero_point: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
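
These schemas register functional, per-tensor, and out variants of the new op; the @register_fake meta kernels only propagate shape and dtype for tracing. A minimal standalone sketch of that contract (not the Cadence kernel itself, just the shape/dtype guarantee the meta functions above encode):

import torch

# Mirrors quantized_softmax_meta / quantized_softmax_per_tensor_meta:
# the output has the same shape and dtype as the input tensor.
def quantized_softmax_shape_contract(input: torch.Tensor) -> torch.Tensor:
    return input.new_empty(input.size(), dtype=input.dtype)

x = torch.zeros(2, 8, 64, dtype=torch.int16)
out = quantized_softmax_shape_contract(x)
assert out.shape == x.shape and out.dtype == x.dtype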

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 78 additions & 1 deletion
@@ -6,9 +6,10 @@
 
 # pyre-strict
 
-from typing import Any, Dict, List, Tuple
+from typing import Any, cast, Dict, List, Tuple
 
 import torch
+from executorch.backends.cadence.aot.compiler_utils import get_shape
 from executorch.backends.cadence.aot.quantizer.patterns import (
     AddmmPattern,
     AddPattern,
@@ -25,6 +26,7 @@
     MatmulPattern,
     ReluPattern0,
     ReluPattern1,
+    SoftmaxPattern,
 )
 from executorch.backends.cadence.aot.quantizer.utils import (
     check_out_zero_point_is_min_range,
@@ -388,6 +390,73 @@ def get_args_and_kwargs_relu(
     return args, kwargs
 
 
+def get_args_and_kwargs_softmax(
+    graph_module: GraphModule,
+    inputs_inputs: List[fx.Node],
+    dequants_inputs: List[fx.Node],
+    quant_node: fx.Node,
+    op_node: fx.Node,
+) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]:
+    # Make a dummy mask tensor
+    mask_shape = get_shape(graph_module, cast(fx.Node, quant_node.args[0]))
+    mask_shape = list(mask_shape) if mask_shape else []
+    mask_shape[-1] = mask_shape[-1] // 16
+    mask_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            mask_shape,
+            0.0,
+        ),
+        {"dtype": torch.int32},
+    )
+    # Make the scale and zero_point tensors
+    in_scale_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            dequants_inputs[0].args[1],
+        ),
+        {"dtype": torch.float32},
+    )
+    in_zero_point_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            dequants_inputs[0].args[2],
+        ),
+        {"dtype": torch.int32},
+    )
+    out_scale_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            quant_node.args[1],
+        ),
+        {"dtype": torch.float32},
+    )
+    out_zero_point_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            quant_node.args[2],
+        ),
+        {"dtype": torch.int32},
+    )
+
+    # Make the args and kwargs for the replacement op
+    args = (
+        inputs_inputs[0],
+        mask_tensor,
+        op_node.args[1],
+        in_scale_tensor,
+        in_zero_point_tensor,
+        out_scale_tensor,
+        out_zero_point_tensor,
+    )
+    kwargs = {}
+    return args, kwargs
+
+
 class QuantFusion(ExportPass):
     # pyre-ignore[2]: Parameter `patterns` has no type specified
     def __init__(self, patterns) -> None:
@@ -543,6 +612,14 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
                         dequants_inputs,
                         quant_node,
                     )
+                elif isinstance(pattern, SoftmaxPattern):
+                    args, kwargs = get_args_and_kwargs_softmax(
+                        graph_module,
+                        inputs_inputs,
+                        dequants_inputs,
+                        quant_node,
+                        anchor_output_node,
+                    )
                 fused = graph_module.graph.call_function(
                     pattern.replacement_op(),
                     args,
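
The helper reads the input scale/zero-point from the dequantize node's args and the output pair from the quantize node's args, then materializes each as a rank-1 tensor via aten.full so the fused op can consume them. A standalone sketch of the equivalent eager-mode tensors (the scale/zero-point values below are made-up examples; in the pass they come from dequants_inputs[0].args[1:3] and quant_node.args[1:3]):

import torch

in_scale, in_zero_point = 0.05, 0          # example values, not from a real graph
out_scale, out_zero_point = 1.0 / 32768, 0

in_scale_tensor = torch.full([1], in_scale, dtype=torch.float32)
in_zero_point_tensor = torch.full([1], in_zero_point, dtype=torch.int32)
out_scale_tensor = torch.full([1], out_scale, dtype=torch.float32)
out_zero_point_tensor = torch.full([1], out_zero_point, dtype=torch.int32)

# The dummy mask keeps 1/16 of the last dimension, e.g. [2, 8, 64] -> [2, 8, 4].
mask_shape = [2, 8, 64]
mask_shape[-1] = mask_shape[-1] // 16
mask_tensor = torch.full(mask_shape, 0, dtype=torch.int32)
print(mask_tensor.shape)  # torch.Size([2, 8, 4])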

backends/cadence/aot/quantizer/patterns.py

Lines changed: 22 additions & 0 deletions
@@ -485,3 +485,25 @@ def partition_types(self) -> List[OpOverload]:
 class Conv2dReluPattern1(ConvReluBasePattern):
     def partition_types(self) -> List[OpOverload]:
         return [torch.ops.aten.conv2d.default, torch.ops.aten.relu_.default]
+
+
+class SoftmaxPattern(QuantizationPattern):
+
+    def partition_types(self) -> List[OpOverload]:
+        return [torch.ops.aten._softmax.default]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
+        softmax_node = fused_partition[0].nodes[-1]
+
+        return PartitionAnchors(
+            inputs=[(softmax_node, 0)],
+            weights=[],
+            biases=[],
+            output=[(softmax_node,)],
+        )
+
+    def replacement_op(self) -> OpOverload:
+        return torch.ops.cadence.quantized_softmax.default
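
The pattern anchors both its input (argument 0) and its output on the single softmax node, so the quantizer observes exactly the tensors the fused op will consume and produce. For reference, the aten overload being partitioned is the plain float softmax:

import torch

# Signature: _softmax(Tensor self, int dim, bool half_to_float) -> Tensor
x = torch.randn(2, 8)
y = torch.ops.aten._softmax.default(x, -1, False)
# Each row of a softmax sums to 1 -- the property the quantized kernel must preserve.
assert torch.allclose(y.sum(dim=-1), torch.ones(2))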

backends/cadence/aot/quantizer/quantizer.py

Lines changed: 29 additions & 0 deletions
@@ -27,6 +27,7 @@
     QuantizationPattern,
     ReluPattern0,
     ReluPattern1,
+    SoftmaxPattern,
 )
 from executorch.backends.cadence.aot.quantizer.utils import (
     find_sequential_partitions_aten,
@@ -58,6 +59,15 @@
     observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
 )
 
+act_qspec_asym16s = QuantizationSpec(
+    dtype=torch.int16,
+    quant_min=-32768,
+    quant_max=32767,
+    qscheme=torch.per_tensor_affine,
+    is_dynamic=False,
+    observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
+)
+
 wgt_qspec_asym8s = QuantizationSpec(
     dtype=torch.int8,
     quant_min=-128,
@@ -92,6 +102,13 @@
     None,
 )
 
+qconfig_A16 = QuantizationConfig(
+    act_qspec_asym16s,
+    act_qspec_asym16s,
+    wgt_qspec_asym8s,
+    None,
+)
+
 
 class CadenceAtenQuantizer(Quantizer):
     def __init__(
@@ -283,3 +300,15 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
         quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8W8))
         quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8W8))
         super().__init__(quantizers)
+
+
+class CadenceWithSoftmaxQuantizer(CadenceQuantizer):
+    """
+    Quantizer including A16 softmax
+    """
+
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = get_cadence_default_quantizers()
+        quantizers.append(CadenceAtenQuantizer(SoftmaxPattern(), qconfig_A16))
+        super().__init__(quantizers)
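
A hedged usage sketch of the new quantizer in the PT2E flow. The export entry point and helper names vary across PyTorch/ExecuTorch versions, so treat this as an outline under those assumptions rather than the canonical recipe:

import torch
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

from executorch.backends.cadence.aot.quantizer.quantizer import (
    CadenceWithSoftmaxQuantizer,
)

model = torch.nn.Sequential(torch.nn.Linear(16, 16), torch.nn.Softmax(dim=-1))
example_inputs = (torch.randn(1, 16),)

# Export to an FX graph (one of several version-dependent entry points).
exported = torch.export.export(model, example_inputs).module()

quantizer = CadenceWithSoftmaxQuantizer()  # default A8W8 patterns + A16 softmax
prepared = prepare_pt2e(exported, quantizer)
prepared(*example_inputs)  # calibration pass so the observers see real ranges
converted = convert_pt2e(prepared)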

examples/models/llava/main.cpp

Lines changed: 53 additions & 7 deletions
@@ -6,8 +6,11 @@
  * LICENSE file in the root directory of this source tree.
  */
 
-#include <executorch/examples/models/llava/runner/llava_runner.h>
+#include <executorch/extension/llm/runner/image.h>
+#include <executorch/extension/llm/runner/multimodal_input.h>
+#include <executorch/extension/llm/runner/multimodal_runner.h>
 #include <gflags/gflags.h>
+#include <pytorch/tokenizers/llama2c_tokenizer.h>
 #define STB_IMAGE_IMPLEMENTATION
 #include <stb_image.h>
 #define STB_IMAGE_RESIZE_IMPLEMENTATION
@@ -44,7 +47,10 @@ DEFINE_int32(
     -1,
     "Number of CPU threads for inference. Defaults to -1, which implies we'll use a heuristic to derive the # of performant cores for a specific device.");
 
-using executorch::extension::llm::Image;
+using ::executorch::extension::llm::Image;
+using ::executorch::extension::llm::make_image_input;
+using ::executorch::extension::llm::make_text_input;
+using ::executorch::extension::llm::MultimodalInput;
 
 void load_image(const std::string& image_path, Image& image) {
   int width, height, channels;
@@ -127,14 +133,54 @@ int32_t main(int32_t argc, char** argv) {
         ->_unsafe_reset_threadpool(num_performant_cores);
   }
 #endif
-  // create llama runner
-  example::LlavaRunner runner(model_path, tokenizer_path, temperature);
+  // Load tokenizer
+  std::unique_ptr<::tokenizers::Tokenizer> tokenizer =
+      std::make_unique<tokenizers::Llama2cTokenizer>();
+  tokenizer->load(tokenizer_path);
+  if (tokenizer == nullptr) {
+    ET_LOG(Error, "Failed to load tokenizer from: %s", tokenizer_path);
+    return 1;
+  }
+
+  // Create multimodal runner
+  std::unique_ptr<::executorch::extension::llm::MultimodalRunner> runner =
+      ::executorch::extension::llm::create_multimodal_runner(
+          model_path, std::move(tokenizer));
+  if (runner == nullptr) {
+    ET_LOG(Error, "Failed to create multimodal runner");
+    return 1;
+  }
 
+  // Load runner
+  auto load_error = runner->load();
+  if (load_error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to load multimodal runner");
+    return 1;
+  }
+
+  // Prepare inputs
+  static const char* kPresetPrompt =
+      "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions. USER: ";
   Image image;
   load_image(image_path, image);
-  std::vector<Image> images = {image};
+  std::vector<MultimodalInput> inputs = {
+      make_text_input(std::string(kPresetPrompt)),
+      make_image_input(image),
+      make_text_input(std::string(prompt)),
+  };
+
+  ::executorch::extension::llm::GenerationConfig config;
+  config.temperature = temperature;
+  config.echo = true;
+
+  // Generate
+  ET_LOG(Info, "Starting generation...");
+  auto error = runner->generate(inputs, config);
+  if (error != ::executorch::runtime::Error::Ok) {
+    ET_LOG(Error, "Failed to generate with multimodal runner");
+    return 1;
+  }
 
-  // generate
-  runner.generate(std::move(images), prompt, seq_len);
+  printf("\n");
   return 0;
 }

extension/llm/runner/multimodal_runner.cpp

Lines changed: 5 additions & 1 deletion
@@ -104,7 +104,11 @@ Error MultimodalRunner::generate(
 
   uint64_t prefill_next_token = 0;
   // Process multimodal inputs in order
-  for (const MultimodalInput& input : inputs) {
+  for (size_t i = 0; i < inputs.size(); ++i) {
+    const MultimodalInput& input = inputs[i];
+    if (config.echo && i == inputs.size() - 1 && input.is_text()) {
+      wrapped_callback(input.get_text());
+    }
    prefill_next_token = ET_UNWRAP(multimodal_prefiller_->prefill(input, pos_));
  }
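
The new loop only echoes the final input, and only when that input is text, so the user's prompt is streamed back through the token callback before prefill. A small Python mirror of that selection rule (names are illustrative, not the C++ API):

# When echo is on, forward the last input to the callback iff it is text;
# every input is still prefilled in order.
def prefill_inputs(inputs, echo, callback, prefill):
    next_token = 0
    for i, inp in enumerate(inputs):
        if echo and i == len(inputs) - 1 and isinstance(inp, str):
            callback(inp)  # stream the prompt text back to the caller
        next_token = prefill(inp)
    return next_token

echoed = []
prefill_inputs(["USER: hi"], echo=True, callback=echoed.append,
               prefill=lambda _: 42)
assert echoed == ["USER: hi"]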

extension/llm/runner/targets.bzl

Lines changed: 1 addition & 0 deletions
@@ -36,6 +36,7 @@ def define_common_targets():
         exported_deps = [
             ":constants",
             "//executorch/extension/module:module" + aten_suffix,
+            "//executorch/extension/tensor:tensor" + aten_suffix,
         ],
     )