diff --git a/backends/cadence/aot/TARGETS b/backends/cadence/aot/TARGETS index 94ab6de0e29..65e6402f256 100644 --- a/backends/cadence/aot/TARGETS +++ b/backends/cadence/aot/TARGETS @@ -149,6 +149,8 @@ executorch_generated_lib( "//executorch/backends/cadence/generic/operators:dequantize_per_tensor", "//executorch/backends/cadence/generic/operators:quantize_per_tensor", "//executorch/backends/cadence/generic/operators:quantized_add_out", + "//executorch/backends/cadence/generic/operators:quantized_conv1d_ncl_out", + "//executorch/backends/cadence/generic/operators:quantized_conv1d_nlc_out", "//executorch/backends/cadence/generic/operators:quantized_conv2d_nchw_out", "//executorch/backends/cadence/generic/operators:quantized_conv2d_nhwc_out", "//executorch/backends/cadence/generic/operators:quantized_fully_connected_out", diff --git a/backends/cadence/aot/functions.yaml b/backends/cadence/aot/functions.yaml index d8024c0245a..3a218b20d73 100644 --- a/backends/cadence/aot/functions.yaml +++ b/backends/cadence/aot/functions.yaml @@ -250,6 +250,16 @@ - arg_meta: null kernel_name: impl::generic::dequantize_per_tensor_asym32s_out +- func: cadence::quantized_conv1d_ncl.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_ncl_out + +- func: cadence::quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_nlc_out + - func: cadence::quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -419,6 +429,16 @@ - arg_meta: null kernel_name: impl::generic::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out +- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_ncl_per_tensor_out + +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
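+# Note: the quantized_conv1d_{ncl,nlc}.out overloads above take tensor-valued
+# weight_zero_point / bias_scale / out_multiplier / out_shift, while the
+# .per_tensor_out overloads here take plain scalar quantization parameters.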
+ kernels: + - arg_meta: null + kernel_name: impl::generic::quantized_conv1d_nlc_per_tensor_out + - func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index bcab980abd6..3a563a6c188 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -350,6 +350,16 @@ - arg_meta: null kernel_name: impl::HiFi::dequantize_per_tensor_asym16s_out +- func: cadence::quantized_conv1d_ncl.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_ncl_out + +- func: cadence::quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_nlc_out + - func: cadence::quantized_conv2d_nchw.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -430,6 +440,16 @@ - arg_meta: null kernel_name: impl::HiFi::quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor_out +- func: cadence::quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_ncl_per_tensor_out + +- func: cadence::quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: impl::HiFi::quantized_conv1d_nlc_per_tensor_out + - func: cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!) 
kernels: - arg_meta: null diff --git a/backends/cadence/aot/ops_registrations.py b/backends/cadence/aot/ops_registrations.py index bd208d04739..01f1f06a9b6 100644 --- a/backends/cadence/aot/ops_registrations.py +++ b/backends/cadence/aot/ops_registrations.py @@ -235,6 +235,30 @@ lib.define( "quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" ) +lib.define( + "quantized_conv1d_nlc(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nlc.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_ncl(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_ncl.out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, Tensor weight_zero_point, Tensor bias_scale, float out_scale, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_ncl.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_ncl.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) out) -> Tensor(a!)" +) +lib.define( + "quantized_conv1d_nlc.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" +) +lib.define( + "quantized_conv1d_nlc.per_tensor_out(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift, *, Tensor(a!) 
out) -> Tensor(a!)" +) lib.define( "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor(Tensor input, Tensor weight, Tensor bias, int[] stride, SymInt[] padding, int[] dilation, int groups, int input_zero_point, int weight_zero_point, float bias_scale, float out_scale, int out_zero_point, int out_multiplier, int out_shift) -> (Tensor Z)" ) @@ -934,6 +958,94 @@ def quantized_conv2d_nhwc_meta( return input.new_empty(output_size, dtype=input.dtype) +@register_fake("cadence::quantized_conv1d_nlc") +def quantized_conv1d_nlc_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + out_channels, *kernel_size, _ = weight.shape + + in_size = input.shape + # Assert that the input tensor has at least 3 dimensions, and at most 6 + assert len(in_size) > 2 + assert len(in_size) < 6 + + # Compute the output tensor size + output_size = ( + get_conv1d_output_size( + in_size, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size[0], + True, + ) + if len(in_size) == 3 + else get_conv2d_output_size( + in_size, out_channels, stride, padding, dilation, kernel_size, True + ) + ) + + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_ncl") +def quantized_conv1d_ncl_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: torch.Tensor, + bias_scale: torch.Tensor, + output_scale: float, + output_zero_point: int, + out_multiplier: torch.Tensor, + out_shift: torch.Tensor, +) -> torch.Tensor: + out_channels, _, *kernel_size = weight.shape + + in_size = input.shape + # Assert that the input tensor has at least 3 dimensions, and at most 6 + assert len(in_size) > 2 + assert len(in_size) < 6 + + # Compute the output tensor size + output_size = ( + get_conv1d_output_size( + in_size, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size[0], + False, + ) + if len(in_size) == 3 + else get_conv2d_output_size( + in_size, out_channels, stride, padding, dilation, kernel_size, False + ) + ) + + return input.new_empty(output_size, dtype=input.dtype) + + @register_fake("cadence::quantized_conv2d_nchw") def quantized_conv2d_nchw_meta( input: torch.Tensor, @@ -2371,6 +2483,68 @@ def roi_align_box_processor_meta( return rois.new_empty((rois.shape[0], 80), dtype=torch.uint8) +@register_fake("cadence::quantized_conv1d_ncl.per_tensor") +def quantized_conv1d_ncl_per_tensor_meta( + input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + out_channels, _, kernel_size = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + False, + ) + return input.new_empty(output_size, dtype=input.dtype) + + +@register_fake("cadence::quantized_conv1d_nlc.per_tensor") +def quantized_conv1d_nlc_per_tensor_meta( + 
input: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: Tuple[int], + padding: Tuple[int], + dilation: Tuple[int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + assert input.dim() == 3 and weight.dim() == 3 + out_channels, _, kernel_size = weight.shape + output_size = get_conv1d_output_size( + input.shape, + out_channels, + stride[1], + padding[1], + dilation[1], + kernel_size, + True, + ) + return input.new_empty(output_size, dtype=input.dtype) + + @register_fake("cadence::quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_meta( input: torch.Tensor, diff --git a/backends/cadence/aot/quantizer/patterns.py b/backends/cadence/aot/quantizer/patterns.py index 9f67204fcf9..2a1d155e575 100644 --- a/backends/cadence/aot/quantizer/patterns.py +++ b/backends/cadence/aot/quantizer/patterns.py @@ -247,7 +247,7 @@ def get_anchors( ) def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv2d_nchw.default + return torch.ops.cadence.quantized_conv1d_ncl.default class Conv2dPattern(QuantizationPattern): @@ -459,29 +459,35 @@ def get_anchors( output=[(relu_node,)], # Output is from the relu node ) - def replacement_op(self) -> OpOverload: - return torch.ops.cadence.quantized_conv2d_nchw.default - # Conv1d + regular relu op fusion class Conv1dReluPattern0(ConvReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv1d.default, torch.ops.aten.relu.default] + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv1d_ncl.default # Conv1d + alternate relu op fusion class Conv1dReluPattern1(ConvReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv1d.default, torch.ops.aten.relu_.default] + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv1d_ncl.default # Conv2d + regular relu op fusion class Conv2dReluPattern0(ConvReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv2d.default, torch.ops.aten.relu.default] + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv2d_nchw.default # Conv2d + alternate relu op fusion class Conv2dReluPattern1(ConvReluBasePattern): def partition_types(self) -> List[OpOverload]: return [torch.ops.aten.conv2d.default, torch.ops.aten.relu_.default] + + def replacement_op(self) -> OpOverload: + return torch.ops.cadence.quantized_conv2d_nchw.default diff --git a/backends/cadence/aot/ref_implementations.py b/backends/cadence/aot/ref_implementations.py index 781f04ae1da..ab2911c7845 100644 --- a/backends/cadence/aot/ref_implementations.py +++ b/backends/cadence/aot/ref_implementations.py @@ -627,6 +627,116 @@ def quantized_conv_per_tensor( ) +@impl(m, "quantized_conv1d_nlc.per_tensor") +def quantized_conv1d_nlc_per_tensor( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + """ + Quantized convolution operation. 
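+ Operates on NLC-layout (batch, length, channels) activations and delegates to quantized_conv_per_tensor.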
+ + Args: + - input_tensor (Tensor): The activations tensor + - weight (Tensor): The weight tensor + - bias (Tensor): The bias tensor + - stride (Tuple[int]): The stride of the convolution + - padding (Tuple[int]): The padding of the convolution + - dilation (Tuple[int]): The dilation of the convolution + - groups (int): The number of groups + - in_zero_point (int): The quantized mapping of zero for the input + - weight_zero_point (int): The quantized mapping of zero for the weight + - bias_scale (float): The quantized bias scale + - output_scale (float): The scale of the output + - output_zero_point (int): The zero point of the output + - out_multiplier (int): Unused + - out_shift (int): Unused + """ + return quantized_conv_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + +@impl(m, "quantized_conv1d_ncl.per_tensor") +def quantized_conv1d_ncl_per_tensor( + input_tensor: torch.Tensor, + weight: torch.Tensor, + bias: torch.Tensor, + stride: tuple[int, int], + padding: tuple[int, int], + dilation: tuple[int, int], + groups: int, + in_zero_point: int, + weight_zero_point: int, + bias_scale: float, + output_scale: float, + output_zero_point: int, + out_multiplier: int, + out_shift: int, +) -> torch.Tensor: + """ + Quantized convolution operation. + + Args: + - input_tensor (Tensor): The activations tensor + - weight (Tensor): The weight tensor + - bias (Tensor): The bias tensor + - stride (Tuple[int]): The stride of the convolution + - padding (Tuple[int]): The padding of the convolution + - dilation (Tuple[int]): The dilation of the convolution + - groups (int): The number of groups + - in_zero_point (int): The quantized mapping of zero for the input + - weight_zero_point (int): The quantized mapping of zero for the weight + - bias_scale (float): The quantized bias scale + - output_scale (float): The scale of the output + - output_zero_point (int): The zero point of the output + - out_multiplier (int): Unused + - out_shift (int): Unused + """ + if not input_tensor.is_contiguous(memory_format=torch.contiguous_format): + raise ValueError("Input tensor must be in NCL format") + return quantized_conv_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + + @impl(m, "quantized_conv2d_nchw.per_tensor") def quantized_conv2d_nchw_per_tensor( input_tensor: torch.Tensor, @@ -803,6 +913,40 @@ def variant( # Call the appropriate base function match layout: + case "ncl": + return quantized_conv1d_ncl_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) + case "nlc": + return quantized_conv1d_nlc_per_tensor( + input_tensor, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out_multiplier, + out_shift, + ) case "nchw": return quantized_conv2d_nchw_per_tensor( input_tensor, @@ -914,22 +1058,22 @@ def quantized_conv2d_nhwc_depthwise_asym8uxsym8u_asym8u_per_tensor() -> ( @impl(m, "quantized_conv1d_ncl_asym8sxsym8s_asym8s.per_tensor") -@quantized_conv_variant("nchw", torch.int8, torch.int8, is_1d=True) +@quantized_conv_variant("ncl", 
torch.int8, torch.int8, is_1d=True) def quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_conv1d_ncl_asym8uxsym8u_asym8u.per_tensor") -@quantized_conv_variant("nchw", torch.uint8, torch.uint8, is_1d=True) +@quantized_conv_variant("ncl", torch.uint8, torch.uint8, is_1d=True) def quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_conv1d_nlc_asym8sxsym8s_asym8s.per_tensor") -@quantized_conv_variant("nhwc", torch.int8, torch.int8, is_1d=True) +@quantized_conv_variant("nlc", torch.int8, torch.int8, is_1d=True) def quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor() -> torch.Tensor: ... @impl(m, "quantized_conv1d_nlc_asym8uxsym8u_asym8u.per_tensor") -@quantized_conv_variant("nhwc", torch.uint8, torch.uint8, is_1d=True) +@quantized_conv_variant("nlc", torch.uint8, torch.uint8, is_1d=True) def quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor() -> torch.Tensor: ... diff --git a/backends/cadence/aot/replace_ops.py b/backends/cadence/aot/replace_ops.py index bf0657315a3..a92c99c35c5 100644 --- a/backends/cadence/aot/replace_ops.py +++ b/backends/cadence/aot/replace_ops.py @@ -787,6 +787,8 @@ class ReplaceTrivialConvWithLinear(ExportPass): trivial_conv_op_to_linear_op: Dict[EdgeOpOverload, EdgeOpOverload] = { exir_ops.edge.cadence.convolution.default: exir_ops.edge.aten.linear.default, + exir_ops.edge.cadence.quantized_conv1d_ncl.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv1d_nlc.default: exir_ops.edge.cadence.quantized_linear.default, exir_ops.edge.cadence.quantized_conv2d_nchw.default: exir_ops.edge.cadence.quantized_linear.default, exir_ops.edge.cadence.quantized_conv2d_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, } @@ -799,10 +801,7 @@ def call_operator(self, op, args, kwargs, meta): # and quantized_conv have the same first 8 args. The quantized op has # extra args holding at least the zero point and scale of input, weight, bias, # and output tensor. - quantized_op = ( - op == exir_ops.edge.cadence.quantized_conv2d_nchw.default - or op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default - ) + quantized_op = op != exir_ops.edge.cadence.convolution.default assert (len(args) == 8 and not quantized_op) or ( len(args) >= 12 and quantized_op ), "Inconsistent args for convolution" @@ -979,11 +978,12 @@ def call_operator( ) -> ProxyValue: if op not in { exir_ops.edge.cadence.convolution.default, + exir_ops.edge.cadence.quantized_conv1d_ncl.default, exir_ops.edge.cadence.quantized_conv2d_nchw.default, }: return super().call_operator(op, args, kwargs, meta) - quantized_op = op == exir_ops.edge.cadence.quantized_conv2d_nchw.default + quantized_op = op != exir_ops.edge.cadence.convolution.default if not quantized_op and len(args) == 8 and args[-1] is True: # Already in NHWC layout. @@ -1067,6 +1067,8 @@ class ReplaceConvWithIm2RowAndLinear(ExportPass): # decompose to. 
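# Editor's sketch (not part of this change): ReplaceTrivialConvWithLinear maps a
# convolution whose kernel spans the full input extent (no padding, unit
# stride/dilation, groups == 1) directly to linear, and ReplaceConvWithIm2RowAndLinear
# handles the general case by applying im2row first; both rest on a convolution being
# expressible as a matmul. A minimal float-only illustration of the trivial 1D case
# with torch (shapes below are arbitrary):
import torch

x = torch.randn(2, 4, 7)   # NCL activations: batch=2, channels=4, length=7
w = torch.randn(8, 4, 7)   # 8 output channels; kernel length equals input length
b = torch.randn(8)

conv_out = torch.nn.functional.conv1d(x, w, b)                        # [2, 8, 1]
lin_out = torch.nn.functional.linear(x.flatten(1), w.flatten(1), b)   # [2, 8]
assert torch.allclose(conv_out.squeeze(-1), lin_out, atol=1e-5)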
conv_op_to_linear_op: Dict[EdgeOpOverload, EdgeOpOverload] = { exir_ops.edge.cadence.convolution.default: exir_ops.edge.aten.linear.default, + exir_ops.edge.cadence.quantized_conv1d_ncl.default: exir_ops.edge.cadence.quantized_linear.default, + exir_ops.edge.cadence.quantized_conv1d_nlc.default: exir_ops.edge.cadence.quantized_linear.default, exir_ops.edge.cadence.quantized_conv2d_nchw.default: exir_ops.edge.cadence.quantized_linear.default, exir_ops.edge.cadence.quantized_conv2d_nhwc.default: exir_ops.edge.cadence.quantized_linear.default, } @@ -1076,10 +1078,7 @@ def call_operator(self, op, args, kwargs, meta): return super().call_operator(op, args, kwargs, meta) # Get the relevant args from convolution node. - quantized_op = ( - op == exir_ops.edge.cadence.quantized_conv2d_nchw.default - or op == exir_ops.edge.cadence.quantized_conv2d_nhwc.default - ) + quantized_op = op != exir_ops.edge.cadence.convolution.default assert (len(args) == 8 and not quantized_op) or ( len(args) >= 12 and quantized_op ), "Inconsistent args for convolution" @@ -1622,6 +1621,14 @@ class ReplaceSingleElementTensorArgumentsFromFullOpWithScalarPass(ExportPass): exir_ops.edge.cadence.quantized_add.per_tensor, [1, 2, 4, 5], ), + exir_ops.edge.cadence.quantized_conv1d_ncl: ( + exir_ops.edge.cadence.quantized_conv1d_ncl.per_tensor, + [8, 9, 12, 13], + ), + exir_ops.edge.cadence.quantized_conv1d_nlc: ( + exir_ops.edge.cadence.quantized_conv1d_nlc.per_tensor, + [8, 9, 12, 13], + ), exir_ops.edge.cadence.quantized_conv2d_nchw: ( exir_ops.edge.cadence.quantized_conv2d_nchw.per_tensor, [8, 9, 12, 13], diff --git a/backends/cadence/generic/operators/quantized_conv1d_ncl_out.cpp b/backends/cadence/generic/operators/quantized_conv1d_ncl_out.cpp new file mode 100644 index 00000000000..910e26db606 --- /dev/null +++ b/backends/cadence/generic/operators/quantized_conv1d_ncl_out.cpp @@ -0,0 +1,368 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +// This implements a generic 2d conv kernel that operates on raw pointers. +// The version handles both quantized and fp32 convolutions. 
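+// Editor's note (not part of this change): in the quantized path below the
+// accumulator is acc = bias[oc] + sum((input - in_zero_point) * (weight -
+// weight_zero_point)), and the result is requantized as
+//   out = quantize(bias_scale * acc, 1 / out_scale, out_zero_point)
+// i.e. approximately round(bias_scale * acc / out_scale) + out_zero_point,
+// clamped to the output dtype range (the exact rounding/saturation behavior is
+// whatever impl::generic::kernels::quantize implements).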
+// The input is of shape [n x c x h x w] +// The weight is of shape [oc x wc x wh x ww], where wc == c +// The output is of shape [n x oc x oh x ow] +// The bias is of shape [oc] +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nchw_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t c, + int32_t h, + int32_t w, + int32_t oc, + int32_t wc, + int32_t wh, + int32_t ww, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * c * h * w; + OT* out_batch = p_out + _n * oc * oh * ow; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + OT* out_plane = out_batch + _oc * oh * ow; + const WT* weight_batch = p_weight + _oc * wc * wh * ww; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of size + // icpg x h x w, with a stencil of size icpg x wh x ww, to compute an + // output channel of size 1 x oh x ow. + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to the + // output channel being computed) with the corresponding weight + // channel. + // If the padding is 0, and dilation is 1, then we can remove the + // unnecessary checks, and simplify the code so that it can be + // vectorized by Tensilica compiler. + if (zero_pad_unit_dilation) { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + int ioff = (_h + _wh) * w + (_w + _ww); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? 
weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + const IT* in_plane = in_batch + _ic * h * w; + const WT* weight_plane = weight_batch + (_ic - sic) * wh * ww; + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1) < w)) { + int ioff = + (_h + d0 * _wh - p0) * w + (_w + d1 * _ww - p1); + int woff = _wh * ww + _ww; + float lhs = in_plane[ioff] - in_zero_point; + float rhs = weight_plane[woff] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_plane[_oh * ow + _ow] = + ::impl::generic::kernels::quantize( + val, inv_out_scale, out_zero_point); + } else { + out_plane[_oh * ow + _ow] = acc; + } + } + } + } + } + } +} + +// The quantized convolution kernel. in_scale and weight_scale are implicit in +// bias_scale, since it is a product of the two. The kernel will branch to +// quantized::conv1d or quantized::conv2d based on the dimensionality of +// activation tensor. +void quantized_conv1d_ncl( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, c, h, w] + const int n = input.size(0); + const int c = input.size(1); + const int h = conv1d ? 1 : input.size(2); + const int w = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wc, wh, ww] + const int oc = weight.size(0); + const int wc = weight.size(1); + const int wh = conv1d ? 1 : weight.size(2); + const int ww = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oc, oh, ow] + const int oh = conv1d ? 1 : out.size(2); + const int ow = conv1d ? 
out.size(2) : out.size(3); + +#define typed_quantized_conv1d_ncl(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nchw_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + c, \ + h, \ + w, \ + oc, \ + wc, \ + wh, \ + ww, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv1d_ncl); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv1d_ncl +} + +void quantized_conv1d_ncl_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED const Tensor& out_multiplier, + __ET_UNUSED const Tensor& out_shift, + Tensor& out) { + const float bias_scale_float = bias_scale.const_data_ptr()[0]; + const int32_t weight_zero_point_int = + weight_zero_point.const_data_ptr()[0]; + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_ncl_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_ncl( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + 
weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/quantized_conv1d_nlc_out.cpp b/backends/cadence/generic/operators/quantized_conv1d_nlc_out.cpp new file mode 100644 index 00000000000..742d9173998 --- /dev/null +++ b/backends/cadence/generic/operators/quantized_conv1d_nlc_out.cpp @@ -0,0 +1,355 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +namespace impl { +namespace generic { +namespace native { + +using ::executorch::aten::IntArrayRef; +using ::executorch::aten::ScalarType; +using ::executorch::aten::Tensor; +using ::executorch::runtime::KernelRuntimeContext; + +template < + typename IT = float, + typename WT = IT, + typename BT = IT, + typename OT = IT, + bool quantized = false> +__attribute__((noinline)) void conv2d_nhwc_core_generic( + // All the arrays + const IT* __restrict__ p_in, + const WT* __restrict__ p_weight, + const BT* __restrict__ p_bias, + OT* __restrict__ p_out, + // The array sizes + int32_t n, + int32_t h, + int32_t w, + int32_t c, + int32_t oc, + int32_t wh, + int32_t ww, + int32_t wc, + int32_t oh, + int32_t ow, + // Stride + int16_t s0, + int16_t s1, + // Padding + int16_t p0, + int16_t p1, + // Dilation + int16_t d0, + int16_t d1, + // Group for depthwise conv + int16_t groups, + // Optional args that are only relevant for quantized convolution + // input zero point + IT in_zero_point = 0, + // weight zero point + int32_t weight_zero_point = 0, + float bias_scale = 1, + float out_scale = 1, + OT out_zero_point = 0) { + float inv_out_scale = 1. / out_scale; + bool zero_pad_unit_dilation = d0 == 1 && d1 == 1 && p0 == 0 && p1 == 0; + + // Compute the number of in and out channels per group + const int ocpg = oc / groups; + const int icpg = c / groups; + + // Iterate over all the output batches (i.e., n) + for (int _n = 0; _n < n; ++_n) { + const IT* in_batch = p_in + _n * h * w * c; + OT* out_batch = p_out + _n * oh * ow * oc; + for (int _h = 0, _oh = 0; _oh < oh; _h += s0, ++_oh) { + for (int _w = 0, _ow = 0; _ow < ow; _w += s1, ++_ow) { + OT* out_line = out_batch + (_oh * ow + _ow) * oc; + // Compute separable convolution for each group + for (int _g = 0; _g < groups; ++_g) { + // Identify the input and output channels involved in the computation + // of this group + int sic = _g * icpg; + int soc = _g * ocpg; + // Populate all the output channels in the group + for (int _oc = soc; _oc < soc + ocpg; ++_oc) { + const WT* weight_batch = p_weight + _oc * wh * ww * wc; + // We compute one output channel at a time. The computation can be + // thought of as a stencil computation: we iterate over an input of + // size h x w x icpg, with a stencil of size wh x ww x icpg, to + // compute an output channel of size oh x ow x 1. + float acc = p_bias[_oc]; + // Below is the stencil computation that performs the hadamard + // product+accumulation of each input channel (contributing to + // the output channel being computed) with the corresponding + // weight channel. 
If the padding is 0, and dilation is 1, then + // we can remove the unnecessary checks, and simplify the code + // so that it can be vectorized by Tensilica compiler.x`` + if (zero_pad_unit_dilation) { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + const IT* in_line = + in_batch + (_h + _wh) * w * c + (_w + _ww) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } else { + for (int _wh = 0; _wh < wh; ++_wh) { + for (int _ww = 0; _ww < ww; ++_ww) { + if (((_h + d0 * _wh - p0) >= 0) && + ((_h + d0 * _wh - p0) < h) && + ((_w + d1 * _ww - p1) >= 0) && + ((_w + d1 * _ww - p1 < w))) { + const IT* in_line = in_batch + + (_h + d0 * _wh - p0) * w * c + (_w + d1 * _ww - p1) * c; + const WT* weight_line = + weight_batch + _wh * ww * wc + _ww * wc; + for (int _ic = sic; _ic < sic + icpg; ++_ic) { + float lhs = in_line[_ic] - in_zero_point; + float rhs = weight_line[_ic - sic] - + (quantized ? weight_zero_point : 0); + acc += lhs * rhs; + } + } + } + } + } + if (quantized) { + float val = bias_scale * acc; + out_line[_oc] = ::impl::generic::kernels::quantize( + val, inv_out_scale, out_zero_point); + } else { + out_line[_oc] = acc; + } + } + } + } + } + } +} + +void quantized_conv1d_nlc( + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int16_t groups, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + bool conv1d = input.dim() == 3; + // input = [n, h, w, c] + const int n = input.size(0); + const int h = conv1d ? 1 : input.size(1); + const int w = conv1d ? input.size(1) : input.size(2); + const int c = conv1d ? input.size(2) : input.size(3); + // weight = [oc, wh, ww, wc] + const int oc = weight.size(0); + const int wh = conv1d ? 1 : weight.size(1); + const int ww = conv1d ? weight.size(1) : weight.size(2); + const int wc = conv1d ? weight.size(2) : weight.size(3); + // output = [n, oh, ow, oc] + const int oh = conv1d ? 1 : out.size(1); + const int ow = conv1d ? 
out.size(1) : out.size(2); + +#define typed_quantized_conv1d_nlc(ctype, dtype) \ + case ScalarType::dtype: { \ + conv2d_nhwc_core_generic( \ + input.const_data_ptr(), \ + weight.const_data_ptr(), \ + bias.const_data_ptr(), \ + out.mutable_data_ptr(), \ + n, \ + h, \ + w, \ + c, \ + oc, \ + wh, \ + ww, \ + wc, \ + oh, \ + ow, \ + stride[0], \ + stride[1], \ + padding[0], \ + padding[1], \ + dilation[0], \ + dilation[1], \ + groups, \ + in_zero_point, \ + weight_zero_point, \ + bias_scale, \ + output_scale, \ + (ctype)output_zero_point); \ + break; \ + } + ScalarType dtype = out.scalar_type(); + switch (dtype) { + ET_FORALL_CADENCE_QUANTIZED_TYPES(typed_quantized_conv1d_nlc); + default: + ET_DCHECK_MSG( + false, "Unhandled dtype %s", torch::executor::toString(dtype)); + } + +#undef typed_quantized_conv1d_nlc +} + +void quantized_conv1d_nlc_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + const Tensor& weight_zero_point, + const Tensor& bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED const Tensor& out_multiplier, + __ET_UNUSED const Tensor& out_shift, + Tensor& out) { + const float bias_scale_float = bias_scale.const_data_ptr()[0]; + const int32_t weight_zero_point_int = + weight_zero_point.const_data_ptr()[0]; + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point_int, + bias_scale_float, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nlc_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( + __ET_UNUSED KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + quantized_conv1d_nlc( + input, + weight, + bias, + stride, + padding, + dilation, + groups, + in_zero_point, + 
weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); +} + +} // namespace native +} // namespace generic +} // namespace impl diff --git a/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp index fbb01c82e65..5d00cda17ba 100644 --- a/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp +++ b/backends/cadence/generic/operators/quantized_conv2d_nchw_out.cpp @@ -496,72 +496,6 @@ void quantized_conv2d_nchw_depthwise_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv2d_nchw( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - -void quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv2d_nchw( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - } // namespace native } // namespace generic } // namespace impl diff --git a/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp index eca836dcc94..fce8dff61e4 100644 --- a/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp +++ b/backends/cadence/generic/operators/quantized_conv2d_nhwc_out.cpp @@ -417,72 +417,6 @@ void quantized_conv2d_nhwc_dilated_asym8uxsym8u_asym8u_per_tensor_out( out); } -void quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& out) { - quantized_conv2d_nhwc( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - -void quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& input, - const Tensor& weight, - const Tensor& bias, - IntArrayRef stride, - IntArrayRef padding, - IntArrayRef dilation, - int64_t groups, - int64_t in_zero_point, - int64_t weight_zero_point, - double bias_scale, - double output_scale, - int64_t output_zero_point, - __ET_UNUSED int64_t out_multiplier, - __ET_UNUSED int64_t out_shift, - Tensor& 
out) { - quantized_conv2d_nhwc( - input, - weight, - bias, - stride, - padding, - dilation, - groups, - in_zero_point, - weight_zero_point, - bias_scale, - output_scale, - output_zero_point, - out); -} - void quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out( __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& input, diff --git a/backends/cadence/generic/operators/targets.bzl b/backends/cadence/generic/operators/targets.bzl index fa0f128b229..3dc2483979e 100644 --- a/backends/cadence/generic/operators/targets.bzl +++ b/backends/cadence/generic/operators/targets.bzl @@ -78,6 +78,36 @@ def define_common_targets(): ], ) + runtime.cxx_library( + name = "quantized_conv1d_ncl_out", + srcs = ["quantized_conv1d_ncl_out.cpp"], + exported_headers = ["operators.h"], + platforms = CXX, + deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/backends/cadence/generic/kernels:cadence_kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + + runtime.cxx_library( + name = "quantized_conv1d_nlc_out", + srcs = ["quantized_conv1d_nlc_out.cpp"], + exported_headers = ["operators.h"], + platforms = CXX, + deps = [ + "//executorch/runtime/kernel:kernel_includes", + "//executorch/backends/cadence/generic/kernels:cadence_kernels", + ], + visibility = [ + "//executorch/backends/cadence/...", + "@EXECUTORCH_CLIENTS", + ], + ) + runtime.cxx_library( name = "quantized_conv2d_nchw_out", srcs = ["quantized_conv2d_nchw_out.cpp"], diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_per_tensor_out.cpp new file mode 100644 index 00000000000..eadd2ec92c3 --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_ncl_per_tensor_out.cpp @@ -0,0 +1,322 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NCHW 1D convolution for int8 x int8 -> int8 +void xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 3; + + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + WORD8* ptr1 = (WORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(WORD8)); + WORD8* ptr2 = (WORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(WORD8)); + WORD8* pin = (WORD8*)ALIGN_PTR(ptr1, 8); + WORD8* pkernel = (WORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = input_width; + p_out_shape[2] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + + xa_nn_transpose_8_8( + pin, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_width; + p_out_shape1[2] = kernel_channels; + + xa_nn_transpose_8_8( + pkernel, + p_out_shape1, + p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = pin + _n * input_channels * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8xasym8( + (UWORD8*)out_batch, + (UWORD8*)in_batch, + (UWORD8*)pkernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +// Optimized NCHW 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + constexpr int kNnlibMaxDim = 3; + + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_channels = weight.size(1); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 1; + UWORD8* ptr1 = (UWORD8*)kernels::allocate_temp_memory( + ctx, ((batches * input_channels * input_width) + 8) * sizeof(UWORD8)); + UWORD8* ptr2 = (UWORD8*)kernels::allocate_temp_memory( + ctx, + ((out_channels * kernel_channels * kernel_width) + 8) * sizeof(UWORD8)); + UWORD8* pin = (UWORD8*)ALIGN_PTR(ptr1, 8); + UWORD8* pkernel = (UWORD8*)ALIGN_PTR(ptr2, 8); + + WORD32 p_inp_shape[kNnlibMaxDim]; + p_inp_shape[0] = batches; + p_inp_shape[1] = input_channels; + p_inp_shape[2] = input_width; + + WORD32 p_out_shape[kNnlibMaxDim]; + p_out_shape[0] = batches; + p_out_shape[1] = input_width; + p_out_shape[2] = input_channels; + + WORD32 p_permute_vec[kNnlibMaxDim] = {0, 2, 1}; + + xa_nn_transpose_8_8( + (WORD8*)pin, + p_out_shape, + (WORD8*)p_inp, + p_inp_shape, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 p_inp_shape1[kNnlibMaxDim]; + p_inp_shape1[0] = out_channels; + p_inp_shape1[1] = kernel_channels; + p_inp_shape1[2] = kernel_width; + + WORD32 p_out_shape1[kNnlibMaxDim]; + p_out_shape1[0] = out_channels; + p_out_shape1[1] = kernel_width; + p_out_shape1[2] = kernel_channels; + + xa_nn_transpose_8_8( + (WORD8*)pkernel, + p_out_shape1, + (WORD8*)p_kernel, + p_inp_shape1, + p_permute_vec, + kNnlibMaxDim, + kNnlibMaxDim); + + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ? 
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = pin + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + pkernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_ncl_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + if (input.scalar_type() == ScalarType::Byte) { + xa_opt_quantized_conv1d_ncl_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == ScalarType::Char) { + xa_opt_quantized_conv1d_ncl_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG(false, "Unhandled input type %hhd", input.scalar_type()); + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_per_tensor_out.cpp b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_per_tensor_out.cpp new file mode 100644 index 00000000000..8cbccef773e --- /dev/null +++ b/backends/cadence/hifi/operators/op_quantized_conv1d_nlc_per_tensor_out.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x)) + (bytes - 1)) & (~(bytes - 1))) + +using Tensor = executorch::aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = executorch::aten::ScalarType; +using ::executorch::aten::IntArrayRef; + +namespace impl { +namespace HiFi { +namespace native { + +// Optimized NLC 1D convolution for int8 x int8 -> int8 +void xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + WORD8* __restrict__ p_out = + (WORD8* __restrict__)out.mutable_data_ptr(); + WORD8* __restrict__ p_inp = + (WORD8* __restrict__)input.const_data_ptr(); + WORD8* __restrict__ p_kernel = + (WORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ?
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)::impl::HiFi::kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + WORD8* in_batch = p_inp + _n * input_channels * input_width; + WORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8xasym8( + (UWORD8*)out_batch, + (UWORD8*)in_batch, + (UWORD8*)p_kernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +// Optimized NLC 1D convolution for uint8 x uint8 -> uint8 +void xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + int32_t in_zero_point, + int32_t weight_zero_point, + float bias_scale, + float output_scale, + int32_t output_zero_point, + Tensor& out) { + UWORD8* __restrict__ p_out = + (UWORD8* __restrict__)out.mutable_data_ptr(); + UWORD8* __restrict__ p_inp = + (UWORD8* __restrict__)input.const_data_ptr(); + UWORD8* __restrict__ p_kernel = + (UWORD8* __restrict__)weight.const_data_ptr(); + WORD32* __restrict__ p_bias = + (WORD32* __restrict__)bias.const_data_ptr(); + + WORD32 batches = input.size(0); + WORD32 input_channels = input.size(1); + WORD32 input_width = input.size(2); + WORD32 out_channels = weight.size(0); + WORD32 kernel_width = weight.size(2); + WORD32 out_width = out.size(2); + WORD32 x_stride = stride[1]; + WORD32 x_padding = padding[1]; + WORD32 input_zero_bias = -in_zero_point; + WORD32 out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648; + WORD32 out_shift32 = 0; + WORD32 kernel_zero_bias = -weight_zero_point; + + WORD32 out_zero_bias = output_zero_point; + WORD32 out_data_format = 0; + WORD32 scratch_size = + xa_nn_conv1d_std_getsize(kernel_width, input_width, input_channels, 8); + scratch_size = scratch_size < 0 ?
0 : scratch_size; + WORD32* ptr_scratch = + (WORD32*)kernels::allocate_temp_memory(ctx, scratch_size); + pVOID p_scratch = (pVOID)ALIGN_PTR(ptr_scratch, 8); + + for (int _n = 0; _n < batches; _n++) { + UWORD8* in_batch = p_inp + _n * input_channels * input_width; + UWORD8* out_batch = p_out + _n * out_channels * out_width; + + xa_nn_conv1d_std_asym8uxasym8u( + out_batch, + in_batch, + p_kernel, + p_bias, + 1, + input_width, + input_channels, + kernel_width, + out_channels, + x_stride, + x_padding, + out_width, + input_zero_bias, + kernel_zero_bias, + out_multiplier32, + out_shift32, + out_zero_bias, + out_data_format, + p_scratch); + } +} + +void quantized_conv1d_nlc_per_tensor_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& weight, + const Tensor& bias, + IntArrayRef stride, + IntArrayRef padding, + __ET_UNUSED IntArrayRef dilation, + __ET_UNUSED int64_t groups, + int64_t in_zero_point, + int64_t weight_zero_point, + double bias_scale, + double output_scale, + int64_t output_zero_point, + __ET_UNUSED int64_t out_multiplier, + __ET_UNUSED int64_t out_shift, + Tensor& out) { + if (input.scalar_type() == ScalarType::Byte) { + xa_opt_quantized_conv1d_nlc_asym8uxsym8u_asym8u( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else if (input.scalar_type() == ScalarType::Char) { + xa_opt_quantized_conv1d_nlc_asym8sxsym8s_asym8s( + ctx, + input, + weight, + bias, + stride, + padding, + in_zero_point, + weight_zero_point, + bias_scale, + output_scale, + output_zero_point, + out); + } else { + ET_CHECK_MSG(false, "Unhandled input type %hhd", input.scalar_type()); + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/targets.bzl b/backends/cadence/hifi/operators/targets.bzl index 1f9814c4a4e..547d89fc5c2 100644 --- a/backends/cadence/hifi/operators/targets.bzl +++ b/backends/cadence/hifi/operators/targets.bzl @@ -67,6 +67,7 @@ OPERATORS = [ "quantized_conv2d_nchw_out", "quantized_conv2d_nchw_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv2d_nchw_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_ncl_per_tensor_out", "quantized_conv1d_ncl_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv1d_ncl_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv2d_nchw_depthwise_asym8sxsym8s_asym8s_per_tensor_out", @@ -76,6 +77,7 @@ OPERATORS = [ "quantized_conv2d_nhwc_out", "quantized_conv2d_nhwc_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv2d_nhwc_asym8uxsym8u_asym8u_per_tensor_out", + "quantized_conv1d_nlc_per_tensor_out", "quantized_conv1d_nlc_asym8sxsym8s_asym8s_per_tensor_out", "quantized_conv1d_nlc_asym8uxsym8u_asym8u_per_tensor_out", "quantized_conv2d_nhwc_depthwise_asym8sxsym8s_asym8s_per_tensor_out",
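Note on output sizing: the kernels above read out_width directly from out.size(2), and dilation is ignored (it is marked __ET_UNUSED in the dispatchers), so out_width is expected to satisfy the standard dilation-free conv1d output-length relation. The sketch below is illustrative only and not part of this diff; the helper name conv1d_out_width is made up for the example.

    // Illustrative sketch, not part of the diff: the dilation-free conv1d
    // output-length relation that out.size(2) is expected to satisfy.
    #include <cstdio>

    static int conv1d_out_width(int input_width, int kernel_width, int stride, int padding) {
      return (input_width + 2 * padding - kernel_width) / stride + 1;
    }

    int main() {
      // e.g. input_width=16, kernel_width=3, stride=1, padding=1 -> 16
      std::printf("%d\n", conv1d_out_width(16, 3, 1, 1));
      return 0;
    }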
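Note on requantization: each per-tensor path folds bias_scale / output_scale into a single Q31 multiplier via out_multiplier32 = bias_scale * (1. / output_scale) * 2147483648, with out_shift32 fixed at 0. The standalone sketch below only illustrates that Q31 conversion; the helper name and the saturation handling are assumptions for the example, not behavior taken from this diff or from NNLib.

    // Illustrative sketch, not part of the diff: converting the combined
    // rescale factor (bias_scale / output_scale) into a Q31 multiplier,
    // clamping rather than overflowing a 32-bit integer.
    #include <cmath>
    #include <cstdint>
    #include <cstdio>

    static int32_t requant_multiplier_q31(float bias_scale, float output_scale) {
      double real_scale = static_cast<double>(bias_scale) / output_scale;
      double q31 = std::round(real_scale * 2147483648.0);  // scale * 2^31
      if (q31 > 2147483647.0) {
        q31 = 2147483647.0;  // clamp to INT32_MAX
      }
      return static_cast<int32_t>(q31);
    }

    int main() {
      // Example: bias_scale = 0.02, output_scale = 0.1 -> combined scale 0.2
      std::printf("%d\n", requant_multiplier_q31(0.02f, 0.1f));  // ~0.2 * 2^31
      return 0;
    }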
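Note on the NCL path's layout handling: before invoking the NNLib conv1d kernel, the xa_opt_quantized_conv1d_ncl_* variants permute both the activation and the weight buffers with xa_nn_transpose_8_8 using permute order {0, 2, 1}, i.e. channels-first [N, C, L] to channels-last [N, L, C]. The naive reference below shows what that permutation does on a flat int8 buffer; it is an illustrative stand-in, not the NNLib routine.

    // Illustrative sketch, not part of the diff: a naive {0, 2, 1} permutation
    // of a flat [N, C, L] int8 buffer into [N, L, C] layout.
    #include <cstdint>
    #include <vector>

    static std::vector<int8_t> transpose_ncl_to_nlc(
        const std::vector<int8_t>& src, int n, int c, int l) {
      std::vector<int8_t> dst(src.size());
      for (int b = 0; b < n; ++b) {
        for (int ch = 0; ch < c; ++ch) {
          for (int x = 0; x < l; ++x) {
            // src index: ((b * C) + ch) * L + x   -- channels-first
            // dst index: ((b * L) + x) * C + ch   -- channels-last
            dst[(b * l + x) * c + ch] = src[(b * c + ch) * l + x];
          }
        }
      }
      return dst;
    }

The same permutation is applied to the weights ([out_channels, kernel_channels, kernel_width] to [out_channels, kernel_width, kernel_channels]) before the per-batch kernel calls.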