Commit a5e2abe

Update
[ghstack-poisoned]
2 parents: 23f7286 + ed482bd

24 files changed: +361 −616 lines
.ci/scripts/test_llava.sh

Lines changed: 1 addition & 1 deletion
@@ -149,7 +149,7 @@ run_and_verify() {
 
   # verify result.txt
   RESULT=$(cat result.txt)
-  EXPECTED_PREFIX="ASSISTANT: image captures a basketball game in progress, with"
+  EXPECTED_PREFIX="ASSISTANT: The image captures a basketball game in progress, with"
 
   if [[ "${RESULT}" == *"${EXPECTED_PREFIX}"* ]]; then
     echo "Expected result prefix: ${EXPECTED_PREFIX}"

.github/workflows/_link_check.yml

Lines changed: 26 additions & 0 deletions
@@ -55,3 +55,29 @@ jobs:
           echo "Or add \`@lint-ignore\` somewhere on the same line as the reference you want to skip checking."
           exit 1
         }
+
+  lint-file-size:
+    if: ${{ github.event_name == 'pull_request' }}
+    uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
+    with:
+      runner: linux.2xlarge
+      docker-image: ci-image:executorch-ubuntu-22.04-linter
+      submodules: false
+      fetch-depth: 0
+      ref: ${{ inputs.ref }}
+      timeout: 30
+      script: |
+        chmod +x ./scripts/lint_file_size.sh
+        ./scripts/lint_file_size.sh $(
+          if [ "${{ github.event_name }}" = "pull_request" ]; then
+            echo "${{ github.event.pull_request.base.sha }}" "${{ github.event.pull_request.head.sha }}"
+          else
+            echo "${{ github.event.before }}" "${{ github.sha }}"
+          fi
+        ) || {
+          echo
+          echo "File size lint failed: some files exceed the 1 MB limit."
+          echo "If you really need large files, consider using Git LFS or storing them elsewhere."
+          echo "If you really need to get unblocked and check in the file, can add it to the EXCEPTIONS list in scripts/lint_file_size.sh."
+          exit 1
+        }
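
The job delegates the actual check to scripts/lint_file_size.sh, which is not part of this diff. As a rough illustration of the idea, a Python sketch that flags files changed between the base and head SHAs and larger than 1 MB could look like the following; the file name check_file_sizes.py, the EXCEPTIONS set, and the exact git invocations are assumptions, not the real lint script:

# check_file_sizes.py -- illustrative sketch only, not the actual scripts/lint_file_size.sh
import subprocess
import sys

LIMIT_BYTES = 1 * 1024 * 1024  # 1 MB
EXCEPTIONS: set[str] = set()   # paths allowed to exceed the limit

def changed_files(base: str, head: str) -> list[str]:
    # Files added, copied, modified, or renamed between the two commits.
    out = subprocess.check_output(
        ["git", "diff", "--name-only", "--diff-filter=ACMR", f"{base}..{head}"],
        text=True,
    )
    return [line for line in out.splitlines() if line]

def blob_size(commit: str, path: str) -> int:
    # Size in bytes of the file as it exists at `commit`.
    out = subprocess.check_output(["git", "cat-file", "-s", f"{commit}:{path}"], text=True)
    return int(out.strip())

def main(base: str, head: str) -> int:
    failed = False
    for path in changed_files(base, head):
        if path in EXCEPTIONS:
            continue
        size = blob_size(head, path)
        if size > LIMIT_BYTES:
            print(f"{path}: {size} bytes exceeds the {LIMIT_BYTES} byte limit")
            failed = True
    return 1 if failed else 0

if __name__ == "__main__":
    sys.exit(main(sys.argv[1], sys.argv[2]))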

backends/cadence/aot/ops_registrations.py

Lines changed: 39 additions & 0 deletions
@@ -324,6 +324,19 @@
     "rope.out(Tensor input, Tensor sin_tensor, Tensor cos_tensor, Tensor? pos, *, Tensor(a!) out) -> Tensor(a!)"
 )
 
+lib.define(
+    "quantized_softmax(Tensor input, Tensor mask, int dim, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point) -> (Tensor out)"
+)
+lib.define(
+    "quantized_softmax.per_tensor(Tensor input, Tensor mask, int dim, float in_scale, int in_zero_point, float out_scale, int out_zero_point) -> (Tensor out)"
+)
+lib.define(
+    "quantized_softmax.out(Tensor input, Tensor mask, int dim, Tensor in_scale, Tensor in_zero_point, Tensor out_scale, Tensor out_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
+lib.define(
+    "quantized_softmax.per_tensor_out(Tensor input, Tensor mask, int dim, float in_scale, int in_zero_point, float out_scale, int out_zero_point, *, Tensor(a!) out) -> Tensor (a!)"
+)
+
 # Load/store with iDMA. These only exist before memory planning.
 # Post memory planning, we check that outputs/inputs for the load/store are in
 # DTCM and replace idma_load/idma_store with idma_copy.
@@ -2329,3 +2342,29 @@ def softmax_f32_f32_meta(
     half_to_float: Optional[bool] = None,
 ) -> torch.Tensor:
     return self.new_empty(self.size(), dtype=self.dtype)
+
+
+@register_fake("cadence::quantized_softmax")
+def quantized_softmax_meta(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    dim: int,
+    in_scale: torch.Tensor,
+    in_zero_point: torch.Tensor,
+    out_scale: torch.Tensor,
+    out_zero_point: torch.Tensor,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
+
+
+@register_fake("cadence::quantized_softmax.per_tensor")
+def quantized_softmax_per_tensor_meta(
+    input: torch.Tensor,
+    mask: torch.Tensor,
+    dim: int,
+    in_scale: float,
+    in_zero_point: int,
+    out_scale: float,
+    out_zero_point: int,
+) -> torch.Tensor:
+    return input.new_empty(input.size(), dtype=input.dtype)
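
The fake (meta) kernels only describe output shape and dtype, which is all the exporter needs to trace through the op without a real implementation. A self-contained sketch of that contract using the generic torch.library API is below; the mylib namespace and the demo op are invented for illustration and are not part of the Cadence op library:

import torch

# Toy op with a schema loosely modeled on quantized_softmax.per_tensor above.
torch.library.define(
    "mylib::demo_quantized_softmax",
    "(Tensor input, int dim, float in_scale, int in_zero_point, float out_scale, int out_zero_point) -> Tensor",
)

@torch.library.register_fake("mylib::demo_quantized_softmax")
def _(input, dim, in_scale, in_zero_point, out_scale, out_zero_point):
    # Same contract as quantized_softmax_meta: output matches the input's shape and dtype.
    return input.new_empty(input.size(), dtype=input.dtype)

class Demo(torch.nn.Module):
    def forward(self, x):
        return torch.ops.mylib.demo_quantized_softmax(x, -1, 0.1, 0, 1.0 / 32768.0, 0)

# No eager kernel is registered; the fake kernel alone lets export trace the op
# and infer its output spec (an int16 tensor shaped like the input).
ep = torch.export.export(Demo(), (torch.zeros(2, 8, dtype=torch.int16),))
print(ep)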

backends/cadence/aot/quantizer/fusion_pass.py

Lines changed: 78 additions & 1 deletion
@@ -6,9 +6,10 @@
 
 # pyre-strict
 
-from typing import Any, Dict, List, Tuple
+from typing import Any, cast, Dict, List, Tuple
 
 import torch
+from executorch.backends.cadence.aot.compiler_utils import get_shape
 from executorch.backends.cadence.aot.quantizer.patterns import (
     AddmmPattern,
     AddPattern,
@@ -25,6 +26,7 @@
     MatmulPattern,
     ReluPattern0,
     ReluPattern1,
+    SoftmaxPattern,
 )
 from executorch.backends.cadence.aot.quantizer.utils import (
     check_out_zero_point_is_min_range,
@@ -388,6 +390,73 @@ def get_args_and_kwargs_relu(
     return args, kwargs
 
 
+def get_args_and_kwargs_softmax(
+    graph_module: GraphModule,
+    inputs_inputs: List[fx.Node],
+    dequants_inputs: List[fx.Node],
+    quant_node: fx.Node,
+    op_node: fx.Node,
+) -> Tuple[Tuple[ArgsType, ...], Dict[str, ArgsType]]:
+    # Make a dummy mask tensor
+    mask_shape = get_shape(graph_module, cast(fx.Node, quant_node.args[0]))
+    mask_shape = list(mask_shape) if mask_shape else []
+    mask_shape[-1] = mask_shape[-1] // 16
+    mask_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            mask_shape,
+            0.0,
+        ),
+        {"dtype": torch.int32},
+    )
+    # Make the scale and zero_point tensors
+    in_scale_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            dequants_inputs[0].args[1],
+        ),
+        {"dtype": torch.float32},
+    )
+    in_zero_point_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            dequants_inputs[0].args[2],
+        ),
+        {"dtype": torch.int32},
+    )
+    out_scale_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            quant_node.args[1],
+        ),
+        {"dtype": torch.float32},
+    )
+    out_zero_point_tensor = graph_module.graph.call_function(
+        torch.ops.aten.full.default,
+        (
+            [1],
+            quant_node.args[2],
+        ),
+        {"dtype": torch.int32},
+    )
+
+    # Make the args and kwargs for the replacement op
+    args = (
+        inputs_inputs[0],
+        mask_tensor,
+        op_node.args[1],
+        in_scale_tensor,
+        in_zero_point_tensor,
+        out_scale_tensor,
+        out_zero_point_tensor,
+    )
+    kwargs = {}
+    return args, kwargs
+
+
 class QuantFusion(ExportPass):
     # pyre-ignore[2]: Parameter `patterns` has no type specified
     def __init__(self, patterns) -> None:
@@ -543,6 +612,14 @@ def call(self, graph_module: fx.GraphModule) -> PassResult:  # noqa: C901
                         dequants_inputs,
                         quant_node,
                     )
+                elif isinstance(pattern, SoftmaxPattern):
+                    args, kwargs = get_args_and_kwargs_softmax(
+                        graph_module,
+                        inputs_inputs,
+                        dequants_inputs,
+                        quant_node,
+                        anchor_output_node,
+                    )
                 fused = graph_module.graph.call_function(
                     pattern.replacement_op(),
                     args,
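
In get_args_and_kwargs_softmax, scalar quantization parameters become 1-element tensors built with aten.full, and the dummy mask's last dimension is 1/16 of the softmax input's last dimension. The eager-mode equivalent of the assembled arguments looks roughly like the sketch below; the input shape and the scale/zero-point values are made up here, whereas the pass reads them off the surrounding dequantize/quantize nodes:

import torch

# Hypothetical values standing in for dequants_inputs[0].args[1:3] and quant_node.args[1:3].
in_scale, in_zero_point = 0.05, 0
out_scale, out_zero_point = 1.0 / 32768.0, 0

x_shape = [1, 8, 128]                           # shape feeding the quantized softmax
dim = -1                                        # op_node.args[1] in the pass
x_q = torch.zeros(x_shape, dtype=torch.int16)   # stands in for the quantized input node

mask_shape = list(x_shape)
mask_shape[-1] = mask_shape[-1] // 16                   # last dim divided by 16
mask = torch.full(mask_shape, 0.0, dtype=torch.int32)   # dummy mask

in_scale_t = torch.full([1], in_scale, dtype=torch.float32)
in_zero_point_t = torch.full([1], in_zero_point, dtype=torch.int32)
out_scale_t = torch.full([1], out_scale, dtype=torch.float32)
out_zero_point_t = torch.full([1], out_zero_point, dtype=torch.int32)

# Argument order mirrors the quantized_softmax schema:
# (input, mask, dim, in_scale, in_zero_point, out_scale, out_zero_point)
args = (x_q, mask, dim, in_scale_t, in_zero_point_t, out_scale_t, out_zero_point_t)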

backends/cadence/aot/quantizer/patterns.py

Lines changed: 22 additions & 0 deletions
@@ -485,3 +485,25 @@ def partition_types(self) -> List[OpOverload]:
 class Conv2dReluPattern1(ConvReluBasePattern):
     def partition_types(self) -> List[OpOverload]:
         return [torch.ops.aten.conv2d.default, torch.ops.aten.relu_.default]
+
+
+class SoftmaxPattern(QuantizationPattern):
+
+    def partition_types(self) -> List[OpOverload]:
+        return [torch.ops.aten._softmax.default]
+
+    def get_anchors(
+        self, gm: fx.GraphModule, fused_partition: List[fx.GraphModule]
+    ) -> PartitionAnchors:
+        # pyre-fixme[29]: `Union[BoundMethod[typing.Callable(torch._C.TensorBase.__ge...
+        softmax_node = fused_partition[0].nodes[-1]
+
+        return PartitionAnchors(
+            inputs=[(softmax_node, 0)],
+            weights=[],
+            biases=[],
+            output=[(softmax_node,)],
+        )
+
+    def replacement_op(self) -> OpOverload:
+        return torch.ops.cadence.quantized_softmax.default
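
SoftmaxPattern anchors the first input and the output of the matched softmax node (no weights or biases), so observers land on both sides of the activation. The toy torch.fx trace below shows the node that would play the role of softmax_node; it is only a Python-level illustration, not the aten-level partition (torch.ops.aten._softmax.default) the quantizer actually matches:

import torch
from torch.fx import symbolic_trace

class Scores(torch.nn.Module):
    def forward(self, x):
        return torch.nn.functional.softmax(x, dim=-1)

gm = symbolic_trace(Scores())
# The last call_function node is the softmax; its first argument is the anchored input.
softmax_node = [n for n in gm.graph.nodes if n.op == "call_function"][-1]
print(softmax_node.target, softmax_node.args[0])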

backends/cadence/aot/quantizer/quantizer.py

Lines changed: 29 additions & 0 deletions
@@ -27,6 +27,7 @@
     QuantizationPattern,
     ReluPattern0,
     ReluPattern1,
+    SoftmaxPattern,
 )
 from executorch.backends.cadence.aot.quantizer.utils import (
     find_sequential_partitions_aten,
@@ -58,6 +59,15 @@
     observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
 )
 
+act_qspec_asym16s = QuantizationSpec(
+    dtype=torch.int16,
+    quant_min=-32768,
+    quant_max=32767,
+    qscheme=torch.per_tensor_affine,
+    is_dynamic=False,
+    observer_or_fake_quant_ctr=HistogramObserver.with_args(eps=2**-12),
+)
+
 wgt_qspec_asym8s = QuantizationSpec(
     dtype=torch.int8,
     quant_min=-128,
@@ -92,6 +102,13 @@
     None,
 )
 
+qconfig_A16 = QuantizationConfig(
+    act_qspec_asym16s,
+    act_qspec_asym16s,
+    wgt_qspec_asym8s,
+    None,
+)
+
 
 class CadenceAtenQuantizer(Quantizer):
     def __init__(
@@ -283,3 +300,15 @@ def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
         quantizers.append(CadenceAtenQuantizer(AddPattern(), qconfig_A8W8))
         quantizers.append(CadenceAtenQuantizer(CatPattern(), qconfig_A8W8))
         super().__init__(quantizers)
+
+
+class CadenceWithSoftmaxQuantizer(CadenceQuantizer):
+    """
+    Quantizer including A16 softmax
+    """
+
+    def __init__(self, quantizers: Optional[list[Quantizer]] = None) -> None:
+        if quantizers is None:
+            quantizers = get_cadence_default_quantizers()
+        quantizers.append(CadenceAtenQuantizer(SoftmaxPattern(), qconfig_A16))
+        super().__init__(quantizers)
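
A minimal sketch of driving CadenceWithSoftmaxQuantizer through the standard PT2E flow is below. The toy model, the calibration call, and the use of torch.export.export_for_training for capture are assumptions for illustration; the Cadence AoT pipeline wraps these steps itself, and whether the captured graph exposes aten._softmax (which SoftmaxPattern matches) depends on the decompositions that pipeline applies:

import torch
from executorch.backends.cadence.aot.quantizer.quantizer import CadenceWithSoftmaxQuantizer
from torch.ao.quantization.quantize_pt2e import convert_pt2e, prepare_pt2e

class TinyScores(torch.nn.Module):
    def forward(self, q, k):
        return torch.softmax(q @ k.transpose(-1, -2), dim=-1)

model = TinyScores().eval()
example_inputs = (torch.randn(1, 8, 16), torch.randn(1, 8, 16))

captured = torch.export.export_for_training(model, example_inputs).module()
quantizer = CadenceWithSoftmaxQuantizer()   # A8W8 defaults plus A16 softmax
prepared = prepare_pt2e(captured, quantizer)
prepared(*example_inputs)                   # calibration pass
quantized = convert_pt2e(prepared)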

backends/cadence/aot/replace_ops.py

Lines changed: 1 addition & 1 deletion
@@ -2250,6 +2250,7 @@ class CommonReplacePasses:
         ReplaceMMWithAddMMPass,
         ReplaceRepeatWithCatPass,
         ReplaceFullLikeWithFullPass,
+        ReplaceAtenConvolutionWithCadenceConvolutionPass,
     ]
 
 
@@ -2282,7 +2283,6 @@ class CadenceReplaceOpsInGraph:
         RemoveNopSelectOpPass,
         ReplacePadWithCatPass,
         ReplaceConstantPadNdWithSlicePass,
-        ReplaceAtenConvolutionWithCadenceConvolutionPass,
         ReplaceConvWithChannelLastConvPass,
         ReplaceTrivialConvWithLinear,
         ReplaceConvWithIm2RowAndLinear,

examples/models/llama/runner/static_attention_io_manager.h

Lines changed: 10 additions & 1 deletion
@@ -589,7 +589,9 @@ class StaticAttentionIOManager {
   size_t prefill(
       executorch::runtime::Span<TokenT> tokens,
       executorch::runtime::Span<TokenT> input_buffer,
-      executorch::runtime::Method& method) {
+      executorch::runtime::Method& method,
+      std::function<void(executorch::runtime::Span<const float>)>
+          logits_callback = nullptr) {
     ET_LOG(Info, "Prefilling at position %zu", input_pos_);
     size_t input_len = input_buffer.size();
     auto& masks = get_mask(input_buffer.size());
@@ -610,6 +612,13 @@ class StaticAttentionIOManager {
           config_.k_cache_output_indices,
           config_.v_cache_output_indices,
           batch_len);
+      if (logits_callback) {
+        auto logits_tensor = method.get_output(0).toTensor();
+        auto* logits = logits_tensor.const_data_ptr<float>();
+        logits_callback(executorch::runtime::Span(
+            logits,
+            logits + batch_len * logits_tensor.size(logits_tensor.dim() - 1)));
+      }
     }
     return batch_len - 1;
   }

examples/models/llava/CMakeLists.txt

Lines changed: 2 additions & 5 deletions
@@ -79,10 +79,7 @@ list(APPEND CMAKE_FIND_ROOT_PATH ${CMAKE_CURRENT_BINARY_DIR}/../../..)
 find_package(executorch CONFIG REQUIRED FIND_ROOT_PATH_BOTH)
 executorch_target_link_options_shared_lib(executorch)
 
-# llava_runner library
-add_subdirectory(runner)
-
-set(LINK_LIBS executorch gflags)
+set(LINK_LIBS executorch gflags extension_llm_runner)
 set(link_libraries ${LINK_LIBS})
 set(_srcs main.cpp)
 
@@ -204,5 +201,5 @@ if(NOT CMAKE_BUILD_TYPE STREQUAL "Debug")
 endif()
 
 target_include_directories(llava_main PUBLIC ${_common_include_directories})
-target_link_libraries(llava_main PUBLIC llava_runner ${link_libraries})
+target_link_libraries(llava_main PUBLIC ${link_libraries})
 target_compile_options(llava_main PUBLIC ${_common_compile_options})
