 // ==============================================================================
 
 #include "HTP/core/constraints.h"
+#include "HTP/core/intrinsics.h"
 #include "HTP/core/op_package_feature_support.h"
 #include "HTP/core/op_register_ext.h"
 #include "HTP/core/optimize.h"
 #include "HTP/core/simple_reg.h"
 #include "QnnOpPackage.h"
+#include "hexagon_protos.h"
 #include "hexagon_types.h"
 #include "hvx_hexagon_protos.h"
 
@@ -163,7 +165,51 @@ GraphStatus examplecustomopImpl(TensorType& out_0, const TensorType& in_0)
   if (input_intfc.dtype == DType::Float32) {
     const float* p_input = static_cast<const float*>(in_0.raw_data_const());
     float* p_output = static_cast<float*>(out_0.raw_data());
-    const int multiplier = 3;
+    const size_t N = in_0.total_storage_elements();
+
+    // Allocate temporary FP16 buffers on stack or heap
+    std::vector<Float16> tmp_in(N);
+    std::vector<Float16> tmp_out(N);
+
+    // 1. Convert FP32 -> FP16
+    for (size_t i = 0; i < N; ++i) {
+      tmp_in[i] = static_cast<Float16>(p_input[i]);
+    }
+
+#ifdef __hexagon__
+    // 2. Run HVX multiply (FP16 domain)
+    union {
+      Float16 f16;
+      uint16_t bits;
+    } f3 = {static_cast<Float16>(3.0f)};
+    HVX_Vector v_mul = Q6_Vh_vsplat_R(f3.bits);
+
+    const int vector_bytes = 128;
+    const int elems_per_vec = vector_bytes / sizeof(Float16);
+
+    for (size_t i = 0; i < N; i += elems_per_vec) {
+      HVX_Vector vin = q6op_V_vldu_A(&tmp_in[i]);
+      HVX_Vector vout = Q6_Vhf_vmpy_VhfVhf(vin, v_mul);
+      q6op_vstu_AV(&tmp_out[i], vout);
+    }
+#else
+    // 2. Fallback scalar multiply
+    for (size_t i = 0; i < N; ++i) {
+      tmp_out[i] = static_cast<Float16>(tmp_in[i] * static_cast<Float16>(3.0f));
+    }
+#endif
+
+    // 3. Convert FP16 -> FP32
+    for (size_t i = 0; i < N; ++i) {
+      p_output[i] = static_cast<float>(tmp_out[i]);
+    }
+
+    return GraphStatus::Success;
+  } else if (input_intfc.dtype == DType::QUInt8) {
+    // printf("[QNN ExecuTorch Op Package test] input is QUInt8\n");
+    const uint8_t* p_input = static_cast<const uint8_t*>(in_0.raw_data_const());
+    uint8_t* p_output = static_cast<uint8_t*>(out_0.raw_data());
+    const int multiplier = 3 * input_intfc.scale / out_intfc.scale;
     for (size_t i = 0; i < input_num_elements; ++i) {
       p_output[i] = multiplier * p_input[i];
 
@@ -177,59 +223,6 @@ GraphStatus examplecustomopImpl(TensorType& out_0, const TensorType& in_0)
           i,
           p_output[i]);
     }
-  } else if (input_intfc.dtype == DType::QUInt8) {
-    // const uint8_t* p_input = static_cast<const
-    // uint8_t*>(in_0.raw_data_const()); uint8_t* p_output =
-    // static_cast<uint8_t*>(out_0.raw_data()); const int multiplier = 3 *
-    // input_intfc.scale / out_intfc.scale; for (size_t i = 0; i <
-    // input_num_elements; ++i) {
-    //   p_output[i] = multiplier * p_input[i];
-
-    //   FARF(
-    //       ALWAYS,
-    //       "[QNN ExecuTorch Op Package test]"
-    //       "input0[%zu]=%f, multiplier=%d, output[%zu]=%f",
-    //       i,
-    //       p_input[i],
-    //       multiplier,
-    //       i,
-    //       p_output[i]);
-    // }
-
-    const uint8_t* p_input = static_cast<const uint8_t*>(in_0.raw_data_const());
-    uint8_t* p_output = static_cast<uint8_t*>(out_0.raw_data());
-    const float multiplier_f = 3.0f * input_intfc.scale / out_intfc.scale;
-    const int multiplier =
-        static_cast<int>(multiplier_f * 128.0f);  // fixed-point scale
-
-    const HVX_Vector* in_vec = reinterpret_cast<const HVX_Vector*>(p_input);
-    HVX_Vector* out_vec = reinterpret_cast<HVX_Vector*>(p_output);
-
-    HVX_Vector v_mult = Q6_V_vsplat_R(multiplier & 0xFF);
-    HVX_Vector vzero = Q6_V_vzero();
-
-    const size_t vec_elems = 128;  // 128 bytes per HVX vector
-    const size_t nvecs = input_num_elements / vec_elems;
-
-    for (size_t i = 0; i < nvecs; ++i) {
-      HVX_Vector vin = Q6_V_vldu_A(in_vec + i);
-      HVX_Vector vout;
-
-#if defined(__HEXAGON_ARCH__)
-      // use available multiply intrinsic
-      vout = Q6_Vub_vmpy_VubRb_s1_rnd_sat(vin, v_mult);
-#else
-      // fallback scalar multiply for x86 simulation
-      alignas(128) uint8_t tmp_in[128], tmp_out[128];
-      memcpy(tmp_in, p_input + i * 128, 128);
-      for (int j = 0; j < 128; ++j)
-        tmp_out[j] = std::min(255, (tmp_in[j] * multiplier) >> 7);
-      memcpy(p_output + i * 128, tmp_out, 128);
-      continue;
-#endif
-
-      Q6_V_vstu_A(out_vec + i, vout);
-    }
   }
 
   return GraphStatus::Success;
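
Note on the new FP16 path: the HVX loop advances 64 fp16 elements per iteration, so its final unaligned load and store run past the end of tmp_in/tmp_out whenever N is not a multiple of 64. A minimal sketch of one way to guard that, reusing the helpers the diff itself uses; the tail-handling code is an assumption for illustration, not part of the commit:

    // Process whole 128-byte vectors first, then finish the remainder in
    // scalar code. N, tmp_in, tmp_out, v_mul, and elems_per_vec are assumed
    // to be declared exactly as in the diff above.
    const size_t full = N - (N % elems_per_vec);
    for (size_t i = 0; i < full; i += elems_per_vec) {
      HVX_Vector vin = q6op_V_vldu_A(&tmp_in[i]);
      HVX_Vector vout = Q6_Vhf_vmpy_VhfVhf(vin, v_mul);
      q6op_vstu_AV(&tmp_out[i], vout);
    }
    for (size_t i = full; i < N; ++i) {  // scalar tail, at most 63 elements
      tmp_out[i] = static_cast<Float16>(tmp_in[i] * static_cast<Float16>(3.0f));
    }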
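Note on the QUInt8 path: `3 * input_intfc.scale / out_intfc.scale` truncates the requantization ratio to an int, and `multiplier * p_input[i]` can wrap when narrowed back to uint8_t. A hedged sketch of a rounded, saturating scalar variant (an illustration only; it assumes zero-points of 0, as the commit's code also implies, and needs <algorithm> for std::min):

    // Keep the requantization ratio in floating point, round to nearest,
    // and clamp to the uint8 range before storing.
    const float ratio = 3.0f * input_intfc.scale / out_intfc.scale;
    for (size_t i = 0; i < input_num_elements; ++i) {
      const int v = static_cast<int>(ratio * p_input[i] + 0.5f);  // round to nearest
      p_output[i] = static_cast<uint8_t>(std::min(v, 255));       // saturate
    }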