pytorch
diff --git a/‎examples/qualcomm/custom_op/custom_ops_fast_gelu.py‎
Lines changed: 51 additions & 23 deletions b/‎examples/qualcomm/custom_op/custom_ops_fast_gelu.py‎
Lines changed: 51 additions & 23 deletions
diff --git a/‎examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/Makefile‎
Lines changed: 5 additions & 10 deletions b/‎examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/Makefile‎
Lines changed: 5 additions & 10 deletions
diff --git a/‎examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/ops/FastGelu.cpp‎
Lines changed: 165 additions & 12 deletions b/‎examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage/src/ops/FastGelu.cpp‎
Lines changed: 165 additions & 12 deletions
diff --git a/‎examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage_old/Makefile‎ b/‎examples/qualcomm/custom_op/fastgelu_op_package_htp/FastGeluOpPackage_old/Makefile‎
@@ -50,24 +50,47 @@ def fast_gelu_impl(x: torch.Tensor) -> torch.Tensor:
 
 
 # registering the out variant.
-my_op_lib.define(
-    "fast_gelu.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)"
-)  # should print 'fast_gelu.out'
-
-
-# ------------------------------------------------------------------------------
-# 2. Simple model using custom op
-# ------------------------------------------------------------------------------
+my_op_lib.define("fast_gelu.out(Tensor input, *, Tensor(a!) output) -> Tensor(a!)")
 
 
 class Model(torch.nn.Module):
     def forward(self, a):
         return torch.ops.my_ops.fast_gelu.default(a)
 
 
-# ------------------------------------------------------------------------------
-# 3. Build + register custom op package
-# ------------------------------------------------------------------------------
+def annotate_custom(gm: torch.fx.GraphModule) -> None:
+    """
+    Attach quantization annotations to the fast_gelu custom-op nodes of ``gm``.
+
+    Every node whose target is ``torch.ops.my_ops.fast_gelu.default`` and that
+    is not annotated yet receives, from the config returned by
+    ``get_ptq_per_channel_quant_config()``, the input-activation spec for its
+    first argument and the output-activation spec for its result.
+    """
+    from executorch.backends.qualcomm.quantizer.annotators import _is_annotated
+    from executorch.backends.qualcomm.quantizer.qconfig import (
+        get_ptq_per_channel_quant_config,
+    )
+    from torch.fx import Node
+    from torchao.quantization.pt2e.quantizer import QuantizationAnnotation
+    from torchao.quantization.pt2e.quantizer.quantizer import Q_ANNOTATION_KEY
+
+    qconfig = get_ptq_per_channel_quant_config()
+    for node in gm.graph.nodes:
+        # Leave other ops alone, and never overwrite an existing annotation.
+        if node.target != torch.ops.my_ops.fast_gelu.default or _is_annotated(
+            [node]
+        ):
+            continue
+
+        activation = node.args[0]
+        assert isinstance(activation, Node)
+
+        node.meta[Q_ANNOTATION_KEY] = QuantizationAnnotation(
+            input_qspec_map={activation: qconfig.input_activation},
+            output_qspec=qconfig.output_activation,
+            _annotated=True,
+        )
 
 
 def _run(cmd, cwd=None):
@@ -135,11 +158,6 @@ def prepare_op_package(
     return op_package_options, op_package_paths
 
 
-# ------------------------------------------------------------------------------
-# 4. Entrypoint — same pattern as custom_ops_1.py
-# ------------------------------------------------------------------------------
-
-
 def main(args):
     if args.build_op_package:
         if "HEXAGON_SDK_ROOT" not in os.environ:
@@ -158,7 +176,7 @@ def main(args):
         quant_dtype = None
 
     instance = Model()
-    sample_input = (torch.randn(1, 128),)
+    sample_input = (torch.randn(1, 16384),)
     pte_filename = "fastgelu_model"
     workspace = f"/data/local/tmp/executorch/{pte_filename}"
     soc_info: SocInfo = _soc_info_table[getattr(QcomChipset, args.model)]
@@ -169,9 +187,14 @@ def main(args):
         soc_info.htp_info.htp_arch,
         args.build_op_package,
     )
-    # quantizer = make_quantizer(
-    #     quant_dtype=quant_dtype, custom_annotations=(annotate_custom,)
-    # )
+    # FP16 runs are not quantized at all; every other run uses 8-bit
+    # activations/weights and needs annotate_custom so the custom op gets
+    # quantization specs. Both values follow from the single use_fp16 flag.
+    quant_dtype = None if args.use_fp16 else QuantDtype.use_8a8w
+    quantizer = None
+    if quant_dtype is not None:
+        quantizer = make_quantizer(
+            quant_dtype=quant_dtype, custom_annotations=(annotate_custom,)
+        )
 
     build_executorch_binary(
         instance,
@@ -180,8 +203,8 @@ def main(args):
         f"{args.artifact}/{pte_filename}",
         sample_input,
         op_package_options=op_package_options,
-        # quant_dtype=quant_dtype,
-        # custom_quantizer=quantizer,
+        quant_dtype=quant_dtype,
+        custom_quantizer=quantizer,
     )
 
     if args.compile_only:
@@ -203,6 +226,7 @@ def main(args):
     adb.pull(output_path=args.artifact)
 
     # Compare results
+    model = Model()
     x86_golden = model(*sample_input)
     import numpy as np
 
@@ -211,10 +235,14 @@ def main(args):
             os.path.join(output_data_folder, "output_0_0.raw"), dtype=np.float32
         )
     ).reshape(x86_golden.size())
+    result = torch.all(torch.isclose(x86_golden, device_output, atol=1e-2)).item()
     print(
         "is_close?",
-        torch.all(torch.isclose(x86_golden, device_output, atol=1e-2)).item(),
+        result,
     )
+    if not result:
+        print(f"x86_golden {x86_golden}")
+        print(f"device_out {device_output}")
 
 
 if __name__ == "__main__":
 
@@ -35,25 +35,20 @@ $(info "HEXAGON_SDK_ROOT is [${HEXAGON_SDK_ROOT}]")
 HEXAGON_SDK_ROOT_V68 := $(HEXAGON_SDK_BASE)/hexagon-sdk-4.2.0
 HEXAGON_SDK_ROOT_V69 := $(HEXAGON_SDK_BASE)/hexagon-sdk-4.3.0
 HEXAGON_SDK_ROOT_V73 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0
-# HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0
-# HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.0.0
-HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_BASE)
-HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_BASE)
+HEXAGON_SDK_ROOT_V75 := $(HEXAGON_SDK_BASE)/hexagon-sdk-5.4.0
+HEXAGON_SDK_ROOT_V79 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.0.0
 HEXAGON_SDK_ROOT_V81 := $(HEXAGON_SDK_BASE)/hexagon-sdk-6.2.0
 #Updated to point to latest sdk to match with libQnnHtp.so
-# HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT_V81)
-HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_BASE)
+HEXAGON_SDK_ROOT_X86 := $(HEXAGON_SDK_ROOT_V81)
 
 HEXAGON_TOOLS_VERSION_V68 := 8.4.09
 HEXAGON_TOOLS_VERSION_V69 := 8.5.03
 HEXAGON_TOOLS_VERSION_V73 := 8.6.02
 HEXAGON_TOOLS_VERSION_V75 := 8.7.03
-# HEXAGON_TOOLS_VERSION_V79 := 8.8.02
-HEXAGON_TOOLS_VERSION_V79 := 8.8.06
+HEXAGON_TOOLS_VERSION_V79 := 8.8.02
 HEXAGON_TOOLS_VERSION_V81 := 19.0.01
 #Updated to point to latest sdk to match with libQnnHtp.so
-# HEXAGON_TOOLS_VERSION_X86 := 19.0.01
-HEXAGON_TOOLS_VERSION_X86 := 8.8.06
+HEXAGON_TOOLS_VERSION_X86 := 19.0.01
 
 ifndef ANDROID_NDK_ROOT
 ifeq ($(MAKECMDGOALS),htp_aarch64)
 
@@ -2,6 +2,7 @@
 // Auto Generated Code for FastGeluOpPackage
 //==============================================================================
 
+#include <algorithm>
 #include <cmath>
 #include "HTP/core/constraints.h"
 #include "HTP/core/op_package_feature_support.h"
@@ -80,27 +81,179 @@ DEF_PACKAGE_OP((fastgeluImpl<Tensor>), "FastGelu")
 
 /* execute functions for ops */
 
+// template <typename TensorType>
+// GraphStatus fastgeluImpl(TensorType& y, const TensorType& x) {
+//   const uint32_t numElements = x.total_storage_elements();
+
+//   if (y.total_storage_elements() != numElements) {
+//     return GraphStatus::ErrorFatal;
+//   }
+
+//   const float kAlpha = 0.7978845608f; // sqrt(2/pi)
+//   const float kCoeff = 0.044715f;
+
+//   float* yData = reinterpret_cast<float*>(y.raw_data());
+//   const float* xData = reinterpret_cast<const float*>(x.raw_data_const());
+
+//   for (uint32_t i = 0; i < numElements; ++i) {
+//     const float v = xData[i];
+//     const float inner = kAlpha * (v + kCoeff * v * v * v);
+//     yData[i] = 0.5f * v * (1.0f + std::tanh(inner));
+//   }
+
+//   return GraphStatus::Success;
+// }
+
 template <typename TensorType>
 GraphStatus fastgeluImpl(TensorType& y, const TensorType& x) {
-  const uint32_t numElements = x.total_storage_elements();
+  const uint32_t N = x.total_storage_elements();
 
-  if (y.total_storage_elements() != numElements) {
+  if (y.total_storage_elements() != N) {
     return GraphStatus::ErrorFatal;
   }
 
-  const float kAlpha = 0.7978845608f; // sqrt(2/pi)
-  const float kCoeff = 0.044715f;
+  const auto in_info = x.get_dtype_intfc();
+  const auto out_info = y.get_dtype_intfc();
 
-  float* yData = reinterpret_cast<float*>(y.raw_data());
-  const float* xData = reinterpret_cast<const float*>(x.raw_data_const());
-
-  for (uint32_t i = 0; i < numElements; ++i) {
-    const float v = xData[i];
-    const float inner = kAlpha * (v + kCoeff * v * v * v);
-    yData[i] = 0.5f * v * (1.0f + std::tanh(inner));
+  if (in_info.dtype != DType::Float32 || in_info.dtype != DType::QUInt8) {
+    return GraphStatus::ErrorPrecision;
   }
+  if (in_info.dtype == DType::Float32 && out_info.dtype == DType::Float32) {
+    const float* xData = static_cast<const float*>(x.raw_data_const());
+    float* yData = static_cast<float*>(y.raw_data());
+
+    // --- Temporary FP16 buffers ---
+    std::vector<Float16> tmp_in(N);
+    std::vector<Float16> tmp_out(N);
+
+    for (uint32_t i = 0; i < N; ++i) {
+      tmp_in[i] = static_cast<Float16>(xData[i]);
+    }
+
+#ifdef __hexagon__
+    union {
+      Float16 f;
+      uint16_t b;
+    } kAlpha = {(Float16)0.7978845608f}; // sqrt(2/pi)
+    union {
+      Float16 f;
+      uint16_t b;
+    } kCoeff = {(Float16)0.044715f};
+    union {
+      Float16 f;
+      uint16_t b;
+    } kHalf = {(Float16)0.5f};
+    union {
+      Float16 f;
+      uint16_t b;
+    } kOne = {(Float16)1.0f};
+    union {
+      Float16 f;
+      uint16_t b;
+    } k27 = {(Float16)27.0f};
+    union {
+      Float16 f;
+      uint16_t b;
+    } kInv27 = {(Float16)(1.0f / 27.0f)};
+    union {
+      Float16 f;
+      uint16_t b;
+    } kOne3 = {(Float16)(1.0f / 3.0f)};
+    union {
+      Float16 f;
+      uint16_t b;
+    } kOne9 = {(Float16)(1.0f / 9.0f)};
+
+    HVX_Vector v_alpha = Q6_Vh_vsplat_R(kAlpha.b);
+    HVX_Vector v_coeff = Q6_Vh_vsplat_R(kCoeff.b);
+    HVX_Vector v_half = Q6_Vh_vsplat_R(kHalf.b);
+    HVX_Vector v_one = Q6_Vh_vsplat_R(kOne.b);
+    HVX_Vector v_27 = Q6_Vh_vsplat_R(k27.b);
+    HVX_Vector v_inv27 = Q6_Vh_vsplat_R(kInv27.b);
+    HVX_Vector v_1_3 = Q6_Vh_vsplat_R(kOne3.b);
+    HVX_Vector v_1_9 = Q6_Vh_vsplat_R(kOne9.b);
+
+    const int VBYTES = 128;
+    const int ELEMS = VBYTES / sizeof(Float16); // 64
 
-  return GraphStatus::Success;
+    for (uint32_t i = 0; i < N; i += ELEMS) {
+      HVX_Vector vx = q6op_V_vldu_A(&tmp_in[i]); // x
+      HVX_Vector vx2 = Q6_Vhf_vmpy_VhfVhf(vx, vx); // x^2
+      HVX_Vector vx3 = Q6_Vhf_vmpy_VhfVhf(vx2, vx); // x^3
+
+      // z = α * (x + c*x^3)
+      HVX_Vector vcx3 = Q6_Vhf_vmpy_VhfVhf(vx3, v_coeff);
+      HVX_Vector vsum = Q6_Vhf_vadd_VhfVhf(vx, vcx3);
+      HVX_Vector vz = Q6_Vhf_vmpy_VhfVhf(vsum, v_alpha);
+
+      // z^2, z^4
+      HVX_Vector vz2 = Q6_Vhf_vmpy_VhfVhf(vz, vz);
+      HVX_Vector vz4 = Q6_Vhf_vmpy_VhfVhf(vz2, vz2);
+
+      // inv_den ≈ (1/27) * (1 - (1/3) z^2 + (1/9) z^4)
+      HVX_Vector term1 = Q6_Vhf_vmpy_VhfVhf(vz2, v_1_3); // (1/3) z^2
+      HVX_Vector one_m_t = Q6_Vhf_vsub_VhfVhf(v_one, term1); // 1 - (1/3) z^2
+      HVX_Vector term2 = Q6_Vhf_vmpy_VhfVhf(vz4, v_1_9); // (1/9) z^4
+      HVX_Vector poly =
+          Q6_Vhf_vadd_VhfVhf(one_m_t, term2); // 1 - 1/3 z^2 + 1/9 z^4
+      HVX_Vector inv_den = Q6_Vhf_vmpy_VhfVhf(poly, v_inv27); // * (1/27)
+
+      // num = z * (27 + z^2) = 27z + z^3
+      HVX_Vector z3 = Q6_Vhf_vmpy_VhfVhf(vz2, vz);
+      HVX_Vector t27z = Q6_Vhf_vmpy_VhfVhf(vz, v_27);
+      HVX_Vector num = Q6_Vhf_vadd_VhfVhf(t27z, z3);
+
+      // tanh(z) ≈ num * inv_den
+      HVX_Vector vtanh = Q6_Vhf_vmpy_VhfVhf(num, inv_den);
+
+      // y = 0.5 * x * (1 + tanh)
+      HVX_Vector one_plus_tanh = Q6_Vhf_vadd_VhfVhf(v_one, vtanh);
+      HVX_Vector t = Q6_Vhf_vmpy_VhfVhf(vx, one_plus_tanh);
+      HVX_Vector vy = Q6_Vhf_vmpy_VhfVhf(t, v_half);
+
+      q6op_vstu_AV(&tmp_out[i], vy);
+    }
+#else
+    // Scalar fallback
+    for (uint32_t i = 0; i < N; ++i) {
+      const float v = xData[i];
+      const float inner = 0.7978845608f * (v + 0.044715f * v * v * v);
+      yData[i] = 0.5f * v * (1.0f + std::tanh(inner));
+    }
+#endif
+
+    for (uint32_t i = 0; i < N; ++i) {
+      yData[i] = static_cast<float>(tmp_out[i]);
+    }
+    return GraphStatus::Success;
+  } else if (in_info.dtype == DType::QUInt8) {
+    const uint8_t* xData = static_cast<const uint8_t*>(x.raw_data_const());
+    uint8_t* yData = static_cast<uint8_t*>(y.raw_data());
+
+    const float x_scale = in_info.scale;
+    const float y_scale = out_info.scale;
+    const int32_t x_zero = in_info.offset;
+    const int32_t y_zero = out_info.offset;
+
+    alignas(128) static uint8_t lut[256];
+    static bool lut_init = false;
+    if (!lut_init) {
+      for (int i = 0; i < 256; ++i) {
+        float x_f = (i - x_zero) * x_scale;
+        float inner = 0.7978845608f * (x_f + 0.044715f * x_f * x_f * x_f);
+        float y_f = 0.5f * x_f * (1.0f + std::tanh(inner));
+        int y_q = static_cast<int>(std::round(y_f / y_scale)) + y_zero;
+        lut[i] = static_cast<uint8_t>(std::clamp(y_q, 0, 255));
+      }
+      lut_init = true;
+    }
+    for (uint32_t i = 0; i < N; ++i) {
+      yData[i] = lut[xData[i]];
+    }
+    return GraphStatus::Success;
+  } else {
+    return GraphStatus::ErrorFatal;
+  }
 }
 
 __attribute__((unused)) static float fastgeluCostFunc(const Op* op) {