Skip to content

Commit 30bdd3e

Browse files
committed
Update on "raise error when trying to save an etrecord missing essential info"
As titled. Differential Revision: [D79687142](https://our.internmc.facebook.com/intern/diff/D79687142/) [ghstack-poisoned]
2 parents 13e9bc4 + 9663d40 commit 30bdd3e

Some content is hidden

Large commits have some content hidden by default. Use the search box below to find content that may be hidden.

65 files changed

+992
-857
lines changed

.ci/scripts/test_model.sh

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -199,6 +199,9 @@ test_model_with_qnn() {
199199
EXPORT_SCRIPT=albert
200200
elif [[ "${MODEL_NAME}" == "bert" ]]; then
201201
EXPORT_SCRIPT=bert
202+
elif [[ "${MODEL_NAME}" == "conv_former" ]]; then
203+
EXPORT_SCRIPT=conv_former
204+
EXTRA_FLAGS="--dataset imagenet-mini/val"
202205
elif [[ "${MODEL_NAME}" == "cvt" ]]; then
203206
EXPORT_SCRIPT=cvt
204207
elif [[ "${MODEL_NAME}" == "distilbert" ]]; then
@@ -238,7 +241,7 @@ test_model_with_qnn() {
238241
"cvt"|"dit"|"focalnet"|"mobilevit_v2"|"pvt"|"swin")
239242
SCRIPT_FOLDER=oss_scripts
240243
;;
241-
"albert"|"bert"|"distilbert"|"roberta"|"efficientnet"|"mobilevit_v1")
244+
"albert"|"bert"|"conv_former"|"distilbert"|"roberta"|"efficientnet"|"mobilevit_v1")
242245
pip install evaluate
243246
SCRIPT_FOLDER=oss_scripts
244247
# 16bit models will encounter op validation fail on some operations,

.github/workflows/trunk.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -568,7 +568,7 @@ jobs:
568568
strategy:
569569
matrix:
570570
dtype: [fp32]
571-
model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l]
571+
model: [dl3, mv3, mv2, ic4, ic3, vit, mb, w2l, conv_former]
572572
fail-fast: false
573573
with:
574574
runner: linux.2xlarge

backends/qualcomm/tests/test_qnn_delegate.py

Lines changed: 59 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4588,6 +4588,65 @@ def test_static_qwen2_5(self):
45884588
msg["inference_speed"], inference_speed_ref[self.model]
45894589
)
45904590

4591+
def test_qwen3(self):
4592+
if not self.required_envs():
4593+
self.skipTest("missing required envs")
4594+
4595+
prompt = "My favourite condiment is "
4596+
cmds = [
4597+
"python",
4598+
f"{self.executorch_root}/examples/qualcomm/oss_scripts/llama/llama.py",
4599+
"--artifact",
4600+
self.artifact_dir,
4601+
"--build_folder",
4602+
self.build_folder,
4603+
"--model",
4604+
self.model,
4605+
"--ip",
4606+
self.ip,
4607+
"--port",
4608+
str(self.port),
4609+
"--prompt",
4610+
f"{prompt}",
4611+
"--ptq",
4612+
"16a8w",
4613+
"--decoder_model",
4614+
"qwen3_0.6b",
4615+
"--model_mode",
4616+
"hybrid",
4617+
"--prefill_ar_len",
4618+
"32",
4619+
"--max_seq_len",
4620+
"128",
4621+
]
4622+
if self.compile_only:
4623+
cmds.extend(["--compile_only"])
4624+
elif self.device:
4625+
cmds.extend(["--device", self.device])
4626+
if self.host:
4627+
cmds.extend(["--host", self.host])
4628+
elif self.enable_x86_64:
4629+
cmds.extend(["--enable_x86_64"])
4630+
if self.pre_gen_pte:
4631+
cmds.extend(["--pre_gen_pte", self.pre_gen_pte])
4632+
4633+
# Accuracy is bad for now. Just check user's prompt is returned.
4634+
golden_start_with = "My favourite condiment is "
4635+
p = subprocess.Popen(cmds, stdout=subprocess.DEVNULL)
4636+
with Listener((self.ip, self.port)) as listener:
4637+
conn = listener.accept()
4638+
p.communicate()
4639+
msg = json.loads(conn.recv())
4640+
if "Error" in msg:
4641+
self.fail(msg["Error"])
4642+
else:
4643+
model_out = msg["result"][0]
4644+
self.assertTrue(
4645+
model_out.startswith(golden_start_with),
4646+
f"Expected Output: {golden_start_with}. Actual Output: {model_out}",
4647+
)
4648+
self.assertGreaterEqual(msg["inference_speed"], 70) # Lanai
4649+
45914650

45924651
class TestExampleOssScript(TestQNN):
45934652
def test_albert(self):

backends/vulkan/runtime/VulkanBackend.cpp

Lines changed: 14 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -390,18 +390,20 @@ bool maybe_resize_input(
390390
const size_t input_i,
391391
executorch::aten::Tensor& et_tensor) {
392392
ValueRef in_tensor_ref = graph->inputs()[input_i].value;
393-
vTensorPtr in_tensor = graph->get_tensor(in_tensor_ref);
393+
394+
const std::vector<int64_t> in_tensor_vk_sizes =
395+
graph->sizes_of(in_tensor_ref);
394396

395397
ET_CHECK_MSG(
396-
et_tensor.dim() == in_tensor->sizes().size(),
398+
et_tensor.dim() == in_tensor_vk_sizes.size(),
397399
"Cannot resize input tensor: old ndim %zu does not match new ndim %zu",
398-
static_cast<size_t>(in_tensor->sizes().size()),
400+
static_cast<size_t>(in_tensor_vk_sizes.size()),
399401
static_cast<size_t>(et_tensor.dim()));
400402

401403
bool should_resize = false;
402404
std::vector<int64_t> new_sizes(et_tensor.dim());
403405
for (size_t i = 0; i < et_tensor.dim(); i++) {
404-
if (in_tensor->sizes()[i] != et_tensor.sizes()[i]) {
406+
if (in_tensor_vk_sizes[i] != et_tensor.sizes()[i]) {
405407
should_resize = true;
406408
}
407409
new_sizes.at(i) = et_tensor.sizes()[i];
@@ -411,10 +413,11 @@ bool maybe_resize_input(
411413
graph->resize_input(input_i, new_sizes);
412414
}
413415

416+
const size_t in_tensor_vk_numel = graph->numel_of(in_tensor_ref);
414417
ET_CHECK_MSG(
415-
in_tensor->numel() == et_tensor.numel(),
418+
in_tensor_vk_numel == et_tensor.numel(),
416419
"Vulkan tensor numel %zu does not match ET tensor numel %zu",
417-
static_cast<size_t>(in_tensor->numel()),
420+
static_cast<size_t>(in_tensor_vk_numel),
418421
static_cast<size_t>(et_tensor.numel()));
419422

420423
return should_resize;
@@ -445,12 +448,14 @@ void maybe_resize_output(
445448
const size_t output_i,
446449
executorch::aten::Tensor& et_tensor) {
447450
ValueRef out_tensor_ref = graph->outputs()[output_i].value;
448-
vTensorPtr out_tensor = graph->get_tensor(out_tensor_ref);
451+
452+
const std::vector<int64_t> out_tensor_vk_sizes =
453+
graph->sizes_of(out_tensor_ref);
449454

450455
executorch::aten::SizesType new_output_size[kTensorDimensionLimit];
451-
size_t ndim = out_tensor->sizes().size();
456+
size_t ndim = out_tensor_vk_sizes.size();
452457
for (int i = 0; i < ndim; ++i) {
453-
new_output_size[i] = out_tensor->sizes()[i];
458+
new_output_size[i] = out_tensor_vk_sizes[i];
454459
}
455460

456461
executorch::aten::ArrayRef<executorch::aten::SizesType> output_size{

backends/vulkan/runtime/graph/ComputeGraph.cpp

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -704,6 +704,38 @@ utils::uvec3 ComputeGraph::create_local_wg_size(const ValueRef idx) {
704704
return create_local_wg_size(create_global_wg_size(idx));
705705
}
706706

707+
void ComputeGraph::bind_tensor_to_descriptor_set(
708+
const ValueRef ref,
709+
vkapi::PipelineBarrier& pipeline_barrier,
710+
const vkapi::MemoryAccessFlags access_type,
711+
vkapi::DescriptorSet& descriptor_set,
712+
const uint32_t idx) {
713+
vTensorPtr tensor = get_tensor(ref);
714+
if (tensor->buffer()) {
715+
vkapi::VulkanBuffer& buffer = tensor->buffer(
716+
pipeline_barrier, vkapi::PipelineStage::COMPUTE, access_type);
717+
descriptor_set.bind(idx, buffer);
718+
} else {
719+
vkapi::VulkanImage& image = tensor->image(
720+
pipeline_barrier, vkapi::PipelineStage::COMPUTE, access_type);
721+
descriptor_set.bind(idx, image);
722+
}
723+
}
724+
725+
void ComputeGraph::bind_value_to_descriptor_set(
726+
const ValueRef ref,
727+
vkapi::PipelineBarrier& pipeline_barrier,
728+
const vkapi::MemoryAccessFlags access_type,
729+
vkapi::DescriptorSet& descriptor_set,
730+
const uint32_t idx) {
731+
if (val_is_tensor(ref)) {
732+
bind_tensor_to_descriptor_set(
733+
ref, pipeline_barrier, access_type, descriptor_set, idx);
734+
} else if (val_is_staging(ref)) {
735+
descriptor_set.bind(idx, get_staging(ref)->buffer());
736+
}
737+
}
738+
707739
void ComputeGraph::copy_into_staging(
708740
const ValueRef idx,
709741
const void* data,
@@ -891,6 +923,17 @@ void ComputeGraph::execute() {
891923
execute_count_++;
892924
}
893925

926+
void ComputeGraph::virtual_clone(const ValueRef dst, const ValueRef src) {
927+
get_tensor(dst)->virtual_clone(*get_tensor(src));
928+
}
929+
930+
void ComputeGraph::virtual_transpose(
931+
const ValueRef tensor,
932+
const int64_t dim0,
933+
const int64_t dim1) {
934+
get_tensor(tensor)->virtual_transpose(dim0, dim1);
935+
}
936+
894937
void ComputeGraph::resize_input(
895938
const int64_t idx,
896939
const std::vector<int64_t>& new_sizes) {

backends/vulkan/runtime/graph/ComputeGraph.h

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,16 @@ class ComputeGraph final {
248248
return values_.at(idx).is##type_name(); \
249249
}
250250

251-
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(vTensorPtr, tensor, Tensor)
251+
protected:
252+
inline vTensorPtr get_tensor(const ValueRef idx) {
253+
return vTensorPtr(this, idx);
254+
}
255+
256+
public:
257+
inline bool val_is_tensor(const ValueRef idx) const {
258+
return values_.at(idx).isTensor();
259+
}
260+
252261
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(TensorRefPtr, tref, TensorRef)
253262
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(StagingPtr, staging, Staging)
254263
GET_AND_CHECK_VAL_AS_PTR_TYPE_FNS(IntListPtr, int_list, IntList)
@@ -319,6 +328,10 @@ class ComputeGraph final {
319328
return values_.at(idx).toConstTensor().numel();
320329
}
321330

331+
inline size_t staging_buffer_numel_of(const ValueRef idx) const {
332+
return values_.at(idx).toConstTensor().staging_buffer_numel();
333+
}
334+
322335
inline utils::StorageType storage_type_of(const ValueRef idx) const {
323336
return values_.at(idx).toConstTensor().storage_type();
324337
}
@@ -832,6 +845,20 @@ class ComputeGraph final {
832845
*/
833846
utils::uvec3 create_local_wg_size(const ValueRef idx);
834847

848+
void bind_tensor_to_descriptor_set(
849+
const ValueRef ref,
850+
vkapi::PipelineBarrier& pipeline_barrier,
851+
const vkapi::MemoryAccessFlags accessType,
852+
vkapi::DescriptorSet& descriptor_set,
853+
const uint32_t idx);
854+
855+
void bind_value_to_descriptor_set(
856+
const ValueRef ref,
857+
vkapi::PipelineBarrier& pipeline_barrier,
858+
const vkapi::MemoryAccessFlags access_type,
859+
vkapi::DescriptorSet& descriptor_set,
860+
const uint32_t idx);
861+
835862
//
836863
// Input/Output
837864
//
@@ -890,14 +917,27 @@ class ComputeGraph final {
890917

891918
void execute();
892919

920+
//
921+
// Tensor View
922+
//
923+
924+
void virtual_clone(const ValueRef dst, const ValueRef src);
925+
926+
void virtual_transpose(
927+
const ValueRef tensor,
928+
const int64_t dim0,
929+
const int64_t dim1);
930+
893931
//
894932
// Dynamic Shape support
895933
//
896934

897935
void resize_input(const int64_t idx, const std::vector<int64_t>& new_sizes);
936+
898937
void virtual_resize(
899938
const ValueRef idx,
900939
const std::vector<int64_t>& new_sizes);
940+
901941
void propagate_resize();
902942

903943
//
@@ -939,6 +979,8 @@ class ComputeGraph final {
939979
friend class SymIntPtr;
940980

941981
friend struct TmpTensor;
982+
friend struct SharedObject;
983+
friend class BlitNode;
942984
};
943985

944986
template <typename T>

backends/vulkan/runtime/graph/ops/BlitNode.cpp

Lines changed: 6 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -26,11 +26,9 @@ BlitNode::BlitNode(
2626
}
2727

2828
void BlitNode::encode(ComputeGraph* graph) {
29-
auto src_tensor = graph->get_tensor(src_);
30-
auto dst_tensor = graph->get_tensor(dst_);
3129
VK_CHECK_COND(
32-
src_tensor->storage_type() != utils::kBuffer &&
33-
dst_tensor->storage_type() != utils::kBuffer,
30+
graph->storage_type_of(src_) != utils::kBuffer &&
31+
graph->storage_type_of(dst_) != utils::kBuffer,
3432
"BlitNode: Only texture backed tensors are supported.");
3533

3634
api::Context* const context = graph->context();
@@ -41,18 +39,18 @@ void BlitNode::encode(ComputeGraph* graph) {
4139
// Hack to get timing data for non shader op
4240
std::string kernel_name("Blit_");
4341
kernel_name.reserve(32);
44-
kernel_name += vkapi::to_string(src_tensor->dtype());
42+
kernel_name += vkapi::to_string(graph->dtype_of(src_));
4543
kernel_name += "_to_";
46-
kernel_name += vkapi::to_string(dst_tensor->dtype());
44+
kernel_name += vkapi::to_string(graph->dtype_of(dst_));
4745

4846
context->report_shader_dispatch_start(
4947
kernel_name, utils::uvec3(), utils::WorkgroupSize(), node_id_);
5048

5149
context->register_blit(
5250
pipeline_barrier,
53-
src_tensor->image(
51+
graph->get_tensor(src_)->image(
5452
pipeline_barrier, vkapi::PipelineStage::TRANSFER, vkapi::kRead),
55-
dst_tensor->image(
53+
graph->get_tensor(dst_)->image(
5654
pipeline_barrier, vkapi::PipelineStage::TRANSFER, vkapi::kWrite));
5755

5856
context->report_shader_dispatch_end();

backends/vulkan/runtime/graph/ops/PrepackNode.cpp

Lines changed: 12 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -18,9 +18,8 @@ namespace vkcompute {
1818

1919
vkapi::ShaderInfo get_noop_shader(ComputeGraph& graph, const ValueRef packed) {
2020
std::string noop_shader_name("no_op");
21-
vTensorPtr t_packed = graph.get_tensor(packed);
22-
add_dtype_suffix(noop_shader_name, *t_packed);
23-
add_storage_type_suffix(noop_shader_name, *t_packed);
21+
add_dtype_suffix(noop_shader_name, graph.dtype_of(packed));
22+
add_storage_type_suffix(noop_shader_name, graph.storage_type_of(packed));
2423
return VK_KERNEL_FROM_STR(noop_shader_name);
2524
}
2625

@@ -48,13 +47,13 @@ PrepackNode::PrepackNode(
4847
}
4948

5049
api::StagingBuffer PrepackNode::create_staging_buffer(ComputeGraph* graph) {
51-
vTensorPtr packed = graph->get_tensor(packed_);
52-
53-
// If no TensorRef is provided, create a staging buffer of zeros according to
54-
// the vkapi::vTensor metadata.
50+
// If no TensorRef is provided, create a staging buffer of zeros based on the
51+
// Tensor metadata.
5552
if (graph->val_is_none(tref_)) {
56-
size_t numel = utils::multiply_integers(packed->sizes());
57-
api::StagingBuffer staging(graph->context(), packed->dtype(), numel);
53+
const std::vector<int64_t> packed_sizes = graph->sizes_of(packed_);
54+
size_t numel = utils::multiply_integers(packed_sizes);
55+
api::StagingBuffer staging(
56+
graph->context(), graph->dtype_of(packed_), numel);
5857
staging.set_staging_zeros();
5958
return staging;
6059
}
@@ -80,7 +79,6 @@ void PrepackNode::encode(ComputeGraph* graph) {
8079

8180
context->check_device_capabilities(shader_);
8281

83-
vTensorPtr packed = graph->get_tensor(packed_);
8482
api::StagingBuffer staging = create_staging_buffer(graph);
8583

8684
std::unique_lock<std::mutex> cmd_lock = context->dispatch_lock();
@@ -101,8 +99,8 @@ void PrepackNode::encode(ComputeGraph* graph) {
10199
shader_, local_workgroup_size_, spec_vars_, push_constants_offset);
102100

103101
uint32_t idx = 0;
104-
bind_tensor_to_descriptor_set(
105-
*packed,
102+
graph->bind_tensor_to_descriptor_set(
103+
packed_,
106104
pipeline_barrier,
107105
vkapi::MemoryAccessType::WRITE,
108106
descriptor_set,
@@ -128,8 +126,8 @@ void PrepackNode::encode(ComputeGraph* graph) {
128126
vkapi::DescriptorSet descriptor_set = context->get_descriptor_set(
129127
noop_shader_, utils::WorkgroupSize(1, 1, 1));
130128

131-
bind_tensor_to_descriptor_set(
132-
*packed,
129+
graph->bind_tensor_to_descriptor_set(
130+
packed_,
133131
pipeline_barrier,
134132
vkapi::MemoryAccessType::READ,
135133
descriptor_set,

0 commit comments

Comments
 (0)