diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 94ab6de0e29..65e6402f256 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -149,6 +149,8 @@ executorch_generated_lib( "//executorch/backends/cadence/generic/operators:dequantize_per_tensor", "//executorch/backends/cadence/generic/operators:quantize_per_tensor", "//executorch/backends/cadence/generic/operators:quantized_add_out", + "//executorch/backends/cadence/generic/operators:quantized_conv1d_ncl_out", + "//executorch/backends/cadence/generic/operators:quantized_conv1d_nlc_out", "//executorch/backends/cadence/generic/operators:quantized_conv2d_nchw_out", "//executorch/backends/cadence/generic/operators:quantized_conv2d_nhwc_out", "//executorch/backends/cadence/generic/operators:quantized_fully_connected_out", diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index d8024c0245a..3a218b20d73 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -250,6 +250,16 @@ - arg_meta: null kernel_name: impl::generic::dequantize_per_tensor_asym32s_out +- func: cadence::quantized_conv1d_ncl.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_ncl_out + +- func: cadence::quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_nlc_out + - func: cadence::quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -419,6 +429,16 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out +- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_ncl_per_tensor_out + +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
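+# Note: the quantized_conv1d_{ncl,nlc}.out overloads above take tensor-valued
+# weight_zero_point / bias_scale / out_multiplier / out_shift, while the
+# .per_tensor_out overloads here take plain scalar quantization parameters.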
+ kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_nlc_per_tensor_out + - func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index bcab980abd6..3a563a6c188 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -350,6 +350,16 @@ - arg_meta: null kernel_name: impl::HiFi::dequantize_per_tensor_asym16s_out +- func: cadence::quantized_conv1d_ncl.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_ncl_out + +- func: cadence::quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_nlc_out + - func: cadence::quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -430,6 +440,16 @@ - arg_meta: null kernel_name: impl::HiFi::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out +- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_ncl_per_tensor_out + +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_nlc_per_tensor_out + - func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index bd208d04739..01f1f06a9b6 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -235,6 +235,30 @@ lib.define( "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantized_conv1d_nlc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_ncl(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_ncl.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_ncl.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nlc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" +) lib.define( "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) @@ -934,6 +958,94 @@ def quantized_conv2d_nhwc_meta( return input.new_empty(output_size, dtype=input.dtype) +@register_fake("cadence::quantized_conv1d_nlc") +def quantized_conv1d_nlc_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + out_channels, *kernel_size, _ = weight.shape + + in_size = input.shape + # Assert that the input tensor has at least 3 dimensions, and at most 6 + assert len(in_size) > 2 + assert len(in_size) < 6 + + # Compute the output tensor size + output_size = ( + get_conv1d_output_size( + in_size, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size[0], + True, + ) + if len(in_size) == 3 + else get_conv2d_output_size( + in_size, out_channels, stride, padding, dilation, kernel_size, True + ) + ) + + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_ncl") +def quantized_conv1d_ncl_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + out_channels, _, *kernel_size = weight.shape + + in_size = input.shape + # Assert that the input tensor has at least 3 dimensions, and at most 6 + assert len(in_size) > 2 + assert len(in_size) < 6 + + # Compute the output tensor size + output_size = ( + get_conv1d_output_size( + in_size, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size[0], + False, + ) + if len(in_size) == 3 + else get_conv2d_output_size( + in_size, out_channels, stride, padding, dilation, kernel_size, False + ) + ) + + return input.new_empty(output_size, dtype=input.dtype) + + @register_fake("cadence::quantized_conv2d_nchw") def quantized_conv2d_nchw_meta( input: torch.Tensor, @@ -2371,6 +2483,68 @@ def roi_align_box_processor_meta( return rois.new_empty((rois.shape[0], 80), dtype=torch.uint8) +@register_fake("cadence::quantized_conv1d_ncl.per_tensor") +def quantized_conv1d_ncl_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + out_channels, _, kernel_size = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + False, + ) + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nlc.per_tensor") +def quantized_conv1d_nlc_per_tensor_meta( + 
input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + out_channels, _, kernel_size = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + True, + ) + return input.new_empty(output_size, dtype=input.dtype) + + @register_fake("cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 9f67204fcf9..2a1d155e575 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -247,7 +247,7 @@ def get_anchors( ) def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv2d_nchw.default + return torch.ops.cadence.quantized_conv1d_ncl.default class Conv2dPattern(QuantizationPattern): @@ -459,29 +459,35 @@ def get_anchors( output=[(relu_node,)], # Output is from the relu node ) - def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv2d_nchw.default - # Conv1d + regular relu op fusion class Conv1dReluPattern0(ConvReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv1d.default, torch.ops.aten.relu.default] + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv1d_ncl.default # Conv1d + alternate relu op fusion class Conv1dReluPattern1(ConvReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv1d.default, torch.ops.aten.relu_.default] + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv1d_ncl.default # Conv2d + regular relu op fusion class Conv2dReluPattern0(ConvReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv2d.default, torch.ops.aten.relu.default] + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv2d_nchw.default # Conv2d + alternate relu op fusion class Conv2dReluPattern1(ConvReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv2d.default, torch.ops.aten.relu_.default] + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv2d_nchw.default diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index 781f04ae1da..ab2911c7845 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -627,6 +627,116 @@ def quantized_conv_per_tensor( ) +@impl(m, "quantized_conv1d_nlc.per_tensor") +def quantized_conv1d_nlc_per_tensor( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + """ + Quantized convolution operation. 
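+ Operates on NLC-layout (batch, length, channels) activations and delegates to quantized_conv_per_tensor.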
+ + Args: + - input_tensor (Tensor): The activations tensor + - weight (Tensor): The weight tensor + - bias (Tensor): The bias tensor + - stride (Tuple[int]): The stride of the convolution + - padding (Tuple[int]): The padding of the convolution + - dilation (Tuple[int]): The dilation of the convolution + - groups (int): The number of groups + - in_zero_point (int): The quantized mapping of zero for the input + - weight_zero_point (int): The quantized mapping of zero for the weight + - bias_scale (float): The quantized bias scale + - output_scale (float): The scale of the output + - output_zero_point (int): The zero point of the output + - out_multiplier (int): Unused + - out_shift (int): Unused + """ + return quantized_conv_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + +@impl(m, "quantized_conv1d_ncl.per_tensor") +def quantized_conv1d_ncl_per_tensor( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + """ + Quantized convolution operation. + + Args: + - input_tensor (Tensor): The activations tensor + - weight (Tensor): The weight tensor + - bias (Tensor): The bias tensor + - stride (Tuple[int]): The stride of the convolution + - padding (Tuple[int]): The padding of the convolution + - dilation (Tuple[int]): The dilation of the convolution + - groups (int): The number of groups + - in_zero_point (int): The quantized mapping of zero for the input + - weight_zero_point (int): The quantized mapping of zero for the weight + - bias_scale (float): The quantized bias scale + - output_scale (float): The scale of the output + - output_zero_point (int): The zero point of the output + - out_multiplier (int): Unused + - out_shift (int): Unused + """ + if not input_tensor.is_contiguous(memory_format=torch.contiguous_format): + raise ValueError("Input tensor must be in NCL format") + return quantized_conv_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + @impl(m, "quantized_conv2d_nchw.per_tensor") def quantized_conv2d_nchw_per_tensor( input_tensor: torch.Tensor, @@ -803,6 +913,40 @@ def variant( # Call the appropriate base function match layout: + case "ncl": + return quantized_conv1d_ncl_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + case "nlc": + return quantized_conv1d_nlc_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) case "nchw": return quantized_conv2d_nchw_per_tensor( input_tensor, @@ -914,22 +1058,22 @@ def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> ( @impl(m, "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") -@quantized_conv_variant("nchw", torch.int8, torch.int8, is_1d=True) +@quantized_conv_variant("ncl", 
torch.int8, torch.int8, is_1d=True) def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") -@quantized_conv_variant("nchw", torch.uint8, torch.uint8, is_1d=True) +@quantized_conv_variant("ncl", torch.uint8, torch.uint8, is_1d=True) def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") -@quantized_conv_variant("nhwc", torch.int8, torch.int8, is_1d=True) +@quantized_conv_variant("nlc", torch.int8, torch.int8, is_1d=True) def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") -@quantized_conv_variant("nhwc", torch.uint8, torch.uint8, is_1d=True) +@quantized_conv_variant("nlc", torch.uint8, torch.uint8, is_1d=True) def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index bf0657315a3..a92c99c35c5 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -787,6 +787,8 @@ class ReplaceTrivialConvWithLinear(ExportPass): trivial_conv_op_to_linear_op: Dict[EdgeOpOverload, EdgeOpOverload] = { exir_ops.edge.cadence.convolution.default: exir_ops.edge.aten.linear.default, + exir_ops.edge.cadence.quantized_conv1d_ncl.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv1d_nlc.default: exir_ops.edge.cadence.quantized_linear.default, exir_ops.edge.cadence.quantized_conv2d_nchw.default: exir_ops.edge.cadence.quantized_linear.default, exir_ops.edge.cadence.quantized_conv2d_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, } @@ -799,10 +801,7 @@ def call_operator(self, op, args, kwargs, meta): # and quantized_conv have the same first 8 args. The quantized op has # extra args holding at least the zero point and scale of input, weight, bias, # and output tensor. - quantized_op = ( - op == exir_ops.edge.cadence.quantized_conv2d_nchw.default - or op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default - ) + quantized_op = op != exir_ops.edge.cadence.convolution.default assert (len(args) == 8 and not quantized_op) or ( len(args) >= 12 and quantized_op ), "Inconsistent args for convolution" @@ -979,11 +978,12 @@ def call_operator( ) -> ProxyValue: if op not in { exir_ops.edge.cadence.convolution.default, + exir_ops.edge.cadence.quantized_conv1d_ncl.default, exir_ops.edge.cadence.quantized_conv2d_nchw.default, }: return super().call_operator(op, args, kwargs, meta) - quantized_op = op == exir_ops.edge.cadence.quantized_conv2d_nchw.default + quantized_op = op != exir_ops.edge.cadence.convolution.default if not quantized_op and len(args) == 8 and args[-1] is True: # Already in NHWC layout. @@ -1067,6 +1067,8 @@ class ReplaceConvWithIm2RowAndLinear(ExportPass): # decompose to. 
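# Editor's sketch (not part of this change): ReplaceTrivialConvWithLinear maps a
# convolution whose kernel spans the full input extent (no padding, unit
# stride/dilation, groups == 1) directly to linear, and ReplaceConvWithIm2RowAndLinear
# handles the general case by applying im2row first; both rest on a convolution being
# expressible as a matmul. A minimal float-only illustration of the trivial 1D case
# with torch (shapes below are arbitrary):
import torch

x = torch.randn(2, 4, 7)   # NCL activations: batch=2, channels=4, length=7
w = torch.randn(8, 4, 7)   # 8 output channels; kernel length equals input length
b = torch.randn(8)

conv_out = torch.nn.functional.conv1d(x, w, b)                        # [2, 8, 1]
lin_out = torch.nn.functional.linear(x.flatten(1), w.flatten(1), b)   # [2, 8]
assert torch.allclose(conv_out.squeeze(-1), lin_out, atol=1e-5)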
conv_op_to_linear_op: Dict[EdgeOpOverload, EdgeOpOverload] = { exir_ops.edge.cadence.convolution.default: exir_ops.edge.aten.linear.default, + exir_ops.edge.cadence.quantized_conv1d_ncl.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv1d_nlc.default: exir_ops.edge.cadence.quantized_linear.default, exir_ops.edge.cadence.quantized_conv2d_nchw.default: exir_ops.edge.cadence.quantized_linear.default, exir_ops.edge.cadence.quantized_conv2d_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, } @@ -1076,10 +1078,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the relevant args from convolution node. - quantized_op = ( - op == exir_ops.edge.cadence.quantized_conv2d_nchw.default - or op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default - ) + quantized_op = op != exir_ops.edge.cadence.convolution.default assert (len(args) == 8 and not quantized_op) or ( len(args) >= 12 and quantized_op ), "Inconsistent args for convolution" @@ -1622,6 +1621,14 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass): exir_ops.edge.cadence.quantized_add.per_tensor, [1, 2, 4, 5], ), + exir_ops.edge.cadence.quantized_conv1d_ncl: ( + exir_ops.edge.cadence.quantized_conv1d_ncl.per_tensor, + [8, 9, 12, 13], + ), + exir_ops.edge.cadence.quantized_conv1d_nlc: ( + exir_ops.edge.cadence.quantized_conv1d_nlc.per_tensor, + [8, 9, 12, 13], + ), exir_ops.edge.cadence.quantized_conv2d_nchw: ( exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, [8, 9, 12, 13], diff --git a/backends/cadence/generic/operators/quantized_conv1d_ncl_out.cpp b/backends/cadence/generic/operators/quantized_conv1d_ncl_out.cpp new file mode 100644 index 00000000000..910e26db606 --- /dev/null +++ b/backends/cadence/generic/operators/quantized_conv1d_ncl_out.cpp @@ -0,0 +1,368 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +// This implements a generic 2d conv kernel that operates on raw pointers. +// The version handles both quantized and fp32 convolutions. 
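+// Editor's note (not part of this change): in the quantized path below the
+// accumulator is acc = bias[oc] + sum((input - in_zero_point) * (weight -
+// weight_zero_point)), and the result is requantized as
+//   out = quantize(bias_scale * acc, 1 / out_scale, out_zero_point)
+// i.e. approximately round(bias_scale * acc / out_scale) + out_zero_point,
+// clamped to the output dtype range (the exact rounding/saturation behavior is
+// whatever impl::generic::kernels::quantize implements).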
+// The input is of shape [n x c x h x w] +// The weight is of shape [oc x wc x wh x ww], where wc == c +// The output is of shape [n x oc x oh x ow] +// The bias is of shape [oc] +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nchw_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t h, + int32_t w, + int32_t oc, + int32_t wc, + int32_t wh, + int32_t ww, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * h * w; + OT* out_batch = p_out + _n * oc * oh * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * oh * ow; + const WT* weight_batch = p_weight + _oc * wc * wh * ww; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an + // output channel of size 1 x oh x ow. + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = (_h + _wh) * w + (_w + _ww); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1) < w)) { + int ioff = + (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_plane[_oh * ow + _ow] = + ::impl::generic::kernels::quantize( + val, inv_out_scale, out_zero_point); + } else { + out_plane[_oh * ow + _ow] = acc; + } + } + } + } + } + } +} + +// The quantized convolution kernel. in_scale and weight_scale are implicit in +// bias_scale, since it is a product of the two. The kernel will branch to +// quantized::conv1d or quantized::conv2d based on the dimensionality of +// activation tensor. +void quantized_conv1d_ncl( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, c, h, w] + const int n = input.size(0); + const int c = input.size(1); + const int h = conv1d ? 1 : input.size(2); + const int w = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wc, wh, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int wh = conv1d ? 1 : weight.size(2); + const int ww = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oc, oh, ow] + const int oh = conv1d ? 1 : out.size(2); + const int ow = conv1d ? 
out.size(2) : out.size(3); + +#define typed_quantized_conv1d_ncl(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nchw_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + c, \ + h, \ + w, \ + oc, \ + wc, \ + wh, \ + ww, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv1d_ncl); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv1d_ncl +} + +void quantized_conv1d_ncl_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED const Tensor& out_multiplier, + __ET_UNUSED const Tensor& out_shift, + Tensor& out) { + const float bias_scale_float = bias_scale.const_data_ptr()[0]; + const int32_t weight_zero_point_int = + weight_zero_point.const_data_ptr()[0]; + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_ncl_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + 
weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/quantized_conv1d_nlc_out.cpp b/backends/cadence/generic/operators/quantized_conv1d_nlc_out.cpp new file mode 100644 index 00000000000..742d9173998 --- /dev/null +++ b/backends/cadence/generic/operators/quantized_conv1d_nlc_out.cpp @@ -0,0 +1,355 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nhwc_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t h, + int32_t w, + int32_t c, + int32_t oc, + int32_t wh, + int32_t ww, + int32_t wc, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * h * w * c; + OT* out_batch = p_out + _n * oh * ow * oc; + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + OT* out_line = out_batch + (_oh * ow + _ow) * oc; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + const WT* weight_batch = p_weight + _oc * wh * ww * wc; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of + // size h x w x icpg, with a stencil of size wh x ww x icpg, to + // compute an output channel of size oh x ow x 1. + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to + // the output channel being computed) with the corresponding + // weight channel. 
If the padding is 0, and dilation is 1, then + // we can remove the unnecessary checks, and simplify the code + // so that it can be vectorized by Tensilica compiler.x`` + if (zero_pad_unit_dilation) { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + const IT* in_line = + in_batch + (_h + _wh) * w * c + (_w + _ww) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1 < w))) { + const IT* in_line = in_batch + + (_h + d0 * _wh - p0) * w * c + (_w + d1 * _ww - p1) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_line[_oc] = ::impl::generic::kernels::quantize( + val, inv_out_scale, out_zero_point); + } else { + out_line[_oc] = acc; + } + } + } + } + } + } +} + +void quantized_conv1d_nlc( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, h, w, c] + const int n = input.size(0); + const int h = conv1d ? 1 : input.size(1); + const int w = conv1d ? input.size(1) : input.size(2); + const int c = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wh, ww, wc] + const int oc = weight.size(0); + const int wh = conv1d ? 1 : weight.size(1); + const int ww = conv1d ? weight.size(1) : weight.size(2); + const int wc = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oh, ow, oc] + const int oh = conv1d ? 1 : out.size(1); + const int ow = conv1d ? 
out.size(1) : out.size(2); + +#define typed_quantized_conv1d_nlc(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nhwc_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + h, \ + w, \ + c, \ + oc, \ + wh, \ + ww, \ + wc, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv1d_nlc); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv1d_nlc +} + +void quantized_conv1d_nlc_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED const Tensor& out_multiplier, + __ET_UNUSED const Tensor& out_shift, + Tensor& out) { + const float bias_scale_float = bias_scale.const_data_ptr()[0]; + const int32_t weight_zero_point_int = + weight_zero_point.const_data_ptr()[0]; + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nlc_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + 
weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp index fbb01c82e65..5d00cda17ba 100644 --- a/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp +++ b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp @@ -496,72 +496,6 @@ void quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv2d_nchw( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - -void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv2d_nchw( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - } // namespace native } // namespace generic } // namespace impl diff --git a/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp index eca836dcc94..fce8dff61e4 100644 --- a/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp +++ b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp @@ -417,72 +417,6 @@ void quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv2d_nhwc( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - -void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& 
out) { - quantized_conv2d_nhwc( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - void quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, diff --git a/backends/cadence/generic/operators/targets.bzl b/backends/cadence/generic/operators/targets.bzl index fa0f128b229..3dc2483979e 100644 --- a/backends/cadence/generic/operators/targets.bzl +++ b/backends/cadence/generic/operators/targets.bzl @@ -78,6 +78,36 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "quantized_conv1d_ncl_out", + srcs = ["quantized_conv1d_ncl_out.cpp"], + exported_headers = ["operators.h"], + platforms = CXX, + deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/backends/cadence/generic/kernels:cadence_kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "quantized_conv1d_nlc_out", + srcs = ["quantized_conv1d_nlc_out.cpp"], + exported_headers = ["operators.h"], + platforms = CXX, + deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/backends/cadence/generic/kernels:cadence_kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + runtime.cxx_library( name = "quantized_conv2d_nchw_out", srcs = ["quantized_conv2d_nchw_out.cpp"], diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_per_tensor_out.cpp new file mode 100644 index 00000000000..eadd2ec92c3 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_per_tensor_out.cpp @@ -0,0 +1,322 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NCHW 1D convolution for int8 x int8 -> int8 +void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 3; + + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); + WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = input_width; + p_out_shape[2] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_width; + p_out_shape1[2] = kernel_channels; + + xa_nn_transpose_8_8( + pkernel, + p_out_shape1, + p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = pin + _n * input_channels * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8xasym8( + (UWORD8*)out_batch, + (UWORD8*)in_batch, + (UWORD8*)pkernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +// Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 3; + + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + UWORD8* ptr1 = (UWORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(UWORD8)); + UWORD8* ptr2 = (UWORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(UWORD8)); + UWORD8* pin = (UWORD8*)ALIGN_PTR(ptr1, 8); + UWORD8* pkernel = (UWORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = input_width; + p_out_shape[2] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + + xa_nn_transpose_8_8( + (WORD8*)pin, + p_out_shape, + (WORD8*)p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_width; + p_out_shape1[2] = kernel_channels; + + xa_nn_transpose_8_8( + (WORD8*)pkernel, + p_out_shape1, + (WORD8*)p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = pin + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + pkernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_ncl_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + if (input.scalar_type() == ScalarType::Byte) { + xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == ScalarType::Char) { + xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG(false, "Unhandled input type %hhd", input.scalar_type()); + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_per_tensor_out.cpp new file mode 100644 index 00000000000..8cbccef773e --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_per_tensor_out.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NLC 1D convolution for int8 x int8 -> int8 +void xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ?
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)::impl::HiFi::kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = p_inp + _n * input_channels * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8xasym8( + (UWORD8*)out_batch, + (UWORD8*)in_batch, + (UWORD8*)p_kernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +// Optimized NLC 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ?
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = p_inp + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + p_kernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nlc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + if (input.scalar_type() == ScalarType::Byte) { + xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == ScalarType::Char) { + xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG(false, "Unhandled input type %hhd", input.scalar_type()); + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index 1f9814c4a4e..547d89fc5c2 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -67,6 +67,7 @@ OPERATORS = [ "quantized_conv2d_nchw_out", "quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_ncl_per_tensor_out", "quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out", @@ -76,6 +77,7 @@ OPERATORS = [ "quantized_conv2d_nhwc_out", "quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_nlc_per_tensor_out", "quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out",
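Note on output sizing: the kernels above read out_width directly from out.size(2), and dilation is ignored (it is marked __ET_UNUSED in the dispatchers), so out_width is expected to satisfy the standard dilation-free conv1d output-length relation. The sketch below is illustrative only and not part of this diff; the helper name conv1d_out_width is made up for the example.

    // Illustrative sketch, not part of the diff: the dilation-free conv1d
    // output-length relation that out.size(2) is expected to satisfy.
    #include <cstdio>

    static int conv1d_out_width(int input_width, int kernel_width, int stride, int padding) {
      return (input_width + 2 * padding - kernel_width) / stride + 1;
    }

    int main() {
      // e.g. input_width=16, kernel_width=3, stride=1, padding=1 -> 16
      std::printf("%d\n", conv1d_out_width(16, 3, 1, 1));
      return 0;
    }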
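Note on requantization: each per-tensor path folds bias_scale / output_scale into a single Q31 multiplier via out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648, with out_shift32 fixed at 0. The standalone sketch below only illustrates that Q31 conversion; the helper name and the saturation handling are assumptions for the example, not behavior taken from this diff or from NNLib.

    // Illustrative sketch, not part of the diff: converting the combined
    // rescale factor (bias_scale / output_scale) into a Q31 multiplier,
    // clamping rather than overflowing a 32-bit integer.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static int32_t requant_multiplier_q31(float bias_scale, float output_scale) {
      double real_scale = static_cast<double>(bias_scale) / output_scale;
      double q31 = std::round(real_scale * 2147483648.0);  // scale * 2^31
      if (q31 > 2147483647.0) {
        q31 = 2147483647.0;  // clamp to INT32_MAX
      }
      return static_cast<int32_t>(q31);
    }

    int main() {
      // Example: bias_scale = 0.02, output_scale = 0.1 -> combined scale 0.2
      std::printf("%d\n", requant_multiplier_q31(0.02f, 0.1f));  // ~0.2 * 2^31
      return 0;
    }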
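Note on the NCL path's layout handling: before invoking the NNLib conv1d kernel, the xa_opt_quantized_conv1d_ncl_* variants permute both the activation and the weight buffers with xa_nn_transpose_8_8 using permute order {0, 2, 1}, i.e. channels-first [N, C, L] to channels-last [N, L, C]. The naive reference below shows what that permutation does on a flat int8 buffer; it is an illustrative stand-in, not the NNLib routine.

    // Illustrative sketch, not part of the diff: a naive {0, 2, 1} permutation
    // of a flat [N, C, L] int8 buffer into [N, L, C] layout.
    #include <cstdint>
    #include <vector>

    static std::vector<int8_t> transpose_ncl_to_nlc(
        const std::vector<int8_t>& src, int n, int c, int l) {
      std::vector<int8_t> dst(src.size());
      for (int b = 0; b < n; ++b) {
        for (int ch = 0; ch < c; ++ch) {
          for (int x = 0; x < l; ++x) {
            // src index: ((b * C) + ch) * L + x   -- channels-first
            // dst index: ((b * L) + x) * C + ch   -- channels-last
            dst[(b * l + x) * c + ch] = src[(b * c + ch) * l + x];
          }
        }
      }
      return dst;
    }

The same permutation is applied to the weights ([out_channels, kernel_channels, kernel_width] to [out_channels, kernel_width, kernel_channels]) before the per-batch kernel calls.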