Skip to content

Commit a444857

Browse files
committed
Enhance the QNNContextProc
1 parent fb765f7 commit a444857

File tree

12 files changed

+251
-68
lines changed

12 files changed

+251
-68
lines changed

BUILD.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
- Install Visual Studio 2022:
2121
- https://docs.qualcomm.com/bundle/publicresource/topics/80-62010-1/setup.html?product=1601111740057789
2222
- Install x64 version [Python-3.12.8](https://www.python.org/ftp/python/3.12.8/python-3.12.8-amd64.exe) or install arm64 version [Python-3.12.6](https://github.com/quic/ai-engine-direct-helper/blob/main/docs/python_arm64.md) if your app is running on arm64.
23-
-
23+
2424
- Use the commands below to install Python dependency:
2525
```
2626
pip install wheel==0.45.1 setuptools==75.8.0 pybind11==2.13.6
@@ -45,7 +45,7 @@ Set QNN_SDK_ROOT=C:\Qualcomm\AIStack\QAIRT\2.42.0.251225\
4545
cd ai-engine-direct-helper
4646
python setup.py --toolchains <Supported Toolchains> --hexagonarch <Hexagon Arch> bdist_wheel
4747

48-
#for example:
48+
# For example:
4949
python setup.py --toolchains arm64x-windows-msvc --hexagonarch 73 bdist_wheel
5050

5151
# If you use below command, it will compile with default Toolchains and Hexagon Arch.

pybind/AppBuilder.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@
1313

1414
ShareMemory::ShareMemory(const std::string& share_memory_name, const size_t share_memory_size) {
1515
m_share_memory_name = share_memory_name;
16+
m_share_memory_size = share_memory_size;
1617
g_LibAppBuilder.CreateShareMemory(share_memory_name, share_memory_size);
1718
}
1819

pybind/AppBuilder.h

Lines changed: 51 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -70,6 +70,33 @@ static inline py::dtype dtypeFromString(const std::string& dtypeStr) {
7070
return py::dtype::of<uint8_t>();
7171
}
7272

73+
// ---------------------------------------------------------------------------------
74+
// Helper: case-insensitive "float32 request" for input_data_type/output_data_type
75+
// Accepts: "float", "float32", "fp32"
76+
// ---------------------------------------------------------------------------------
77+
static inline bool isFloat32Request(const std::string& s) {
78+
std::string t = s;
79+
for (auto& c : t) c = static_cast<char>(::tolower(c));
80+
return (t == "float" || t == "float32" || t == "fp32");
81+
}
82+
83+
// ---------------------------------------------------------------------------------
84+
// Helper: identify if a py::dtype is float32 (NumPy kind 'f' and itemsize == 4)
85+
// Note: We avoid relying on dtype object identity; use kind/itemsize instead.
86+
// ---------------------------------------------------------------------------------
87+
static inline bool isNumpyFloat32Dtype(const py::dtype& dt) {
88+
try {
89+
// dt.kind is a 1-char string in NumPy, e.g. 'f' for floating
90+
std::string kindStr = py::str(dt.attr("kind"));
91+
char kind = kindStr.empty() ? '\0' : kindStr[0];
92+
py::ssize_t itemsize = dt.attr("itemsize").cast<py::ssize_t>();
93+
return (kind == 'f' && itemsize == 4);
94+
} catch (...) {
95+
// conservative fallback
96+
return false;
97+
}
98+
}
99+
73100
// ---------------------------------------------------------------------------
74101
// Helper: product of dims (for output element count)
75102
// ---------------------------------------------------------------------------
@@ -206,7 +233,8 @@ std::vector<py::array> inference(std::string model_name, const std::vector<py::a
206233

207234
// Keep temporary converted/contiguous arrays alive during ModelInference
208235
std::vector<py::array> keepAlive;
209-
const bool floatMode = (input_data_type == "float");
236+
const bool floatMode = isFloat32Request(input_data_type);
237+
const bool floatOutMode = isFloat32Request(output_data_type);
210238

211239
//QNN_INF("inference input vector length: %d\n", input.size());
212240

@@ -271,7 +299,16 @@ std::vector<py::array> inference(std::string model_name, const std::vector<py::a
271299
{ static_cast<py::ssize_t>(dt.itemsize()) },
272300
outputBuffers[i],
273301
free_data);
274-
output.push_back(result);
302+
303+
// If user requests float output, cast to float32 before returning.
304+
// IMPORTANT: do NOT reinterpret the raw buffer as float32 (size may not match).
305+
// We first create 'result' using the inferred real dtype, then cast (copy) if needed.
306+
if (floatOutMode && !isNumpyFloat32Dtype(dt)) {
307+
py::array_t<float, py::array::c_style | py::array::forcecast> farr(result);
308+
output.push_back(py::array(farr));
309+
} else {
310+
output.push_back(result);
311+
}
275312
}
276313
//print_time("convert Data To ArrayV");
277314

@@ -288,7 +325,8 @@ std::vector<py::array> inference_P(std::string model_name, std::string proc_name
288325

289326
// Keep temporary converted/contiguous arrays alive during ModelInference
290327
std::vector<py::array> keepAlive;
291-
const bool floatMode = (input_data_type == "float");
328+
const bool floatMode = isFloat32Request(input_data_type);
329+
const bool floatOutMode = isFloat32Request(output_data_type);
292330

293331
for (auto i = 0; i < input.size(); i++) {
294332
if (floatMode) {
@@ -351,7 +389,15 @@ std::vector<py::array> inference_P(std::string model_name, std::string proc_name
351389
{ static_cast<py::ssize_t>(dt.itemsize()) },
352390
outputBuffers[i],
353391
free_data);
354-
output.push_back(result);
392+
393+
// If user requests float output, cast to float32 before returning.
394+
// For shared memory outputs, this will create a float32 copy (shared memory remains untouched).
395+
if (floatOutMode && !isNumpyFloat32Dtype(dt)) {
396+
py::array_t<float, py::array::c_style | py::array::forcecast> farr(result);
397+
output.push_back(py::array(farr));
398+
} else {
399+
output.push_back(result);
400+
}
355401
}
356402
//print_time("convert Data To ArrayV");
357403

@@ -371,6 +417,7 @@ int delete_memory(std::string share_memory_name) {
371417
class ShareMemory {
372418
public:
373419
std::string m_share_memory_name;
420+
size_t m_share_memory_size = 0;
374421

375422
ShareMemory(const std::string& share_memory_name, const size_t share_memory_size);
376423
~ShareMemory();

script/qai_appbuilder/qnncontext.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -222,7 +222,6 @@ def __init__(self,
222222
model_path: str = "None",
223223
backend_lib_path: str = "None",
224224
system_lib_path: str = "None",
225-
runtime: str = Runtime.HTP,
226225
is_async: bool = False,
227226
input_data_type: str = DataType.FLOAT,
228227
output_data_type: str = DataType.FLOAT
@@ -258,7 +257,6 @@ def __init__(self,
258257
model_path: str = "None",
259258
backend_lib_path: str = "None",
260259
system_lib_path: str = "None",
261-
runtime: str = Runtime.HTP,
262260
is_async: bool = False,
263261
input_data_type: str = DataType.FLOAT,
264262
output_data_type: str = DataType.FLOAT
@@ -300,7 +298,6 @@ def __init__(self,
300298
backend_lib_path: str = "None",
301299
system_lib_path: str = "None",
302300
lora_adapters=None,
303-
runtime: str = Runtime.HTP,
304301
is_async: bool = False,
305302
input_data_type: str = DataType.FLOAT,
306303
output_data_type: str = DataType.FLOAT
@@ -355,6 +352,7 @@ def __init__(self,
355352
"""
356353
self.share_memory_name = share_memory_name
357354
self.m_memory = appbuilder.ShareMemory(share_memory_name, share_memory_size)
355+
self.share_memory_size = share_memory_size
358356

359357
#@timer
360358
def __del__(self):

setup.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,8 @@ def build_clean():
141141
if os.path.exists(binary_path + "/QAIAppSvc.exe"):
142142
os.remove(binary_path + "/libappbuilder.dll")
143143
os.remove(binary_path + "/QAIAppSvc.exe")
144+
if os.path.exists(binary_path + "/QAIAppSvc.pdb"):
145+
os.remove(binary_path + "/QAIAppSvc.pdb")
144146
if os.path.exists(binary_path + "/libappbuilder.pdb"):
145147
os.remove(binary_path + "/libappbuilder.pdb")
146148
if os.path.exists(binary_path + "/libappbuilder.so"):
@@ -164,6 +166,8 @@ def build_cmake():
164166
if os.path.exists("lib/" + CONFIG + "/QAIAppSvc.exe"):
165167
shutil.copy("lib/" + CONFIG +"/libappbuilder.dll", binary_path)
166168
shutil.copy("lib/" + CONFIG + "/QAIAppSvc.exe", binary_path)
169+
if os.path.exists("lib/" + CONFIG + "/QAIAppSvc.pdb"):
170+
shutil.copy("lib/" + CONFIG + "/QAIAppSvc.pdb", binary_path)
167171
if os.path.exists("lib/" + CONFIG + "/libappbuilder.pdb"):
168172
shutil.copy("lib/" + CONFIG + "/libappbuilder.pdb", binary_path)
169173
if os.path.exists("lib/" + "libappbuilder.so"):

src/LibAppBuilder.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -465,7 +465,7 @@ bool ModelInitializeEx(const std::string& model_name, const std::string& proc_na
465465
bool ModelInferenceEx(std::string model_name, std::string proc_name, std::string share_memory_name,
466466
std::vector<uint8_t*>& inputBuffers, std::vector<size_t>& inputSize,
467467
std::vector<uint8_t*>& outputBuffers, std::vector<size_t>& outputSize,
468-
std::string& perfProfile, size_t graphIndex) {
468+
std::string& perfProfile, size_t graphIndex, size_t share_memory_size=0) {
469469
bool result = true;
470470

471471
//QNN_INF("LibAppBuilder::ModelInference: %s \n", model_name.c_str());
@@ -487,7 +487,7 @@ bool ModelInferenceEx(std::string model_name, std::string proc_name, std::string
487487
result = false;
488488
}
489489

490-
if (result && sample_app::StatusCode::SUCCESS != app->executeGraphsBuffers(inputBuffers, outputBuffers, outputSize, perfProfile, graphIndex)) {
490+
if (result && sample_app::StatusCode::SUCCESS != app->executeGraphsBuffers(inputBuffers, outputBuffers, outputSize, perfProfile, graphIndex, share_memory_size)) {
491491
app->reportError("Graph Execution failure");
492492
result = false;
493493
}
@@ -599,9 +599,9 @@ bool LibAppBuilder::ModelInference(std::string model_name, std::string proc_name
599599

600600
bool LibAppBuilder::ModelInference(std::string model_name, std::vector<uint8_t*>& inputBuffers,
601601
std::vector<uint8_t*>& outputBuffers, std::vector<size_t>& outputSize,
602-
std::string& perfProfile, size_t graphIndex){
602+
std::string& perfProfile, size_t graphIndex, size_t share_memory_size){
603603
std::vector<size_t> inputSize;
604-
return ModelInferenceEx(model_name, "", "", inputBuffers, inputSize, outputBuffers, outputSize, perfProfile, graphIndex);
604+
return ModelInferenceEx(model_name, "", "", inputBuffers, inputSize, outputBuffers, outputSize, perfProfile, graphIndex, share_memory_size);
605605
}
606606

607607
bool LibAppBuilder::ModelApplyBinaryUpdate(const std::string model_name, std::vector<LoraAdapter>& lora_adapters) {

src/LibAppBuilder.hpp

Lines changed: 9 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,7 @@ class LIBAPPBUILDER_API LibAppBuilder
6363

6464
bool ModelInference(std::string model_name, std::vector<uint8_t*>& inputBuffers,
6565
std::vector<uint8_t*>& outputBuffers, std::vector<size_t>& outputSize,
66-
std::string& perfProfile, size_t graphIndex = 0);
66+
std::string& perfProfile, size_t graphIndex = 0, size_t share_memory_size = 0);
6767
bool ModelInference(std::string model_name, std::string proc_name, std::string share_memory_name,
6868
std::vector<uint8_t*>& inputBuffers, std::vector<size_t>& inputSize,
6969
std::vector<uint8_t*>& outputBuffers, std::vector<size_t>& outputSize,
@@ -77,34 +77,33 @@ class LIBAPPBUILDER_API LibAppBuilder
7777
bool CreateShareMemory(std::string share_memory_name, size_t share_memory_size);
7878
bool DeleteShareMemory(std::string share_memory_name);
7979

80-
// issue#24
8180
std::vector<std::vector<size_t>> getInputShapes(std::string model_name);
8281
std::vector<std::string> getInputDataType(std::string model_name);
8382
std::vector<std::string> getOutputDataType(std::string model_name);
8483
std::vector<std::vector<size_t>> getOutputShapes(std::string model_name);
8584
std::string getGraphName(std::string model_name);
8685
std::vector<std::string> getInputName(std::string model_name);
8786
std::vector<std::string> getOutputName(std::string model_name);
88-
ModelInfo_t getModelInfo(std::string model_name, std::string proc_name, std::string input);
89-
ModelInfo_t getModelInfo(std::string model_name, std::string input);
90-
ModelInfo_t getModelInfoExt(std::string model_name, std::string input);
91-
//proc
87+
9288
std::vector<std::vector<size_t>> getInputShapes(std::string model_name, std::string proc_name);
9389
std::vector<std::string> getInputDataType(std::string model_name, std::string proc_name);
94-
std::vector<std::string> getInputName(std::string model_name, std::string proc_name);
95-
std::string getGraphName(std::string model_name, std::string proc_name);
9690
std::vector<std::string> getOutputDataType(std::string model_name, std::string proc_name);
9791
std::vector<std::vector<size_t>> getOutputShapes(std::string model_name, std::string proc_name);
92+
std::string getGraphName(std::string model_name, std::string proc_name);
93+
std::vector<std::string> getInputName(std::string model_name, std::string proc_name);
9894
std::vector<std::string> getOutputName(std::string model_name, std::string proc_name);
99-
// issue#24
95+
96+
ModelInfo_t getModelInfo(std::string model_name, std::string input);
97+
ModelInfo_t getModelInfo(std::string model_name, std::string proc_name, std::string input);
98+
ModelInfo_t getModelInfoExt(std::string model_name, std::string input);
99+
100100
std::vector<std::vector<size_t>> m_inputShapes;
101101
std::vector<std::string> m_inputDataType;
102102
std::vector<std::vector<size_t>> m_outputShapes;
103103
std::vector<std::string> m_outputDataType;
104104
std::string m_graphName;
105105
std::vector<std::string> m_inputName;
106106
std::vector<std::string> m_outputName;
107-
108107
};
109108

110109

0 commit comments

Comments (0)