Commit 5f4e0a6

Scope handling w/o changes to Tensor
1 parent 9e8bcf7 commit 5f4e0a6

File tree

19 files changed: +238 −99 lines changed


cmake/modules/OpenCL.cmake

Lines changed: 3 additions & 0 deletions
@@ -80,6 +80,9 @@ if(USE_OPENCL)
       message(STATUS "Set OpenCL Target version to " ${CMAKE_MATCH_1})
     endif()
   endif(USE_OPENCL_EXTN_QCOM)
+  if(PROFILE_SHADER_DUMP)
+    add_definitions(-DPROFILE_SHADER_DUMP)
+  endif(PROFILE_SHADER_DUMP)
 else()
   list(APPEND COMPILER_SRCS src/target/opt/build_opencl_off.cc)
 endif(USE_OPENCL)
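
Configuring with -DPROFILE_SHADER_DUMP=ON now defines the PROFILE_SHADER_DUMP macro for every compiled translation unit. The dump site in the OpenCL runtime is not part of this diff; the helper below is only a hypothetical sketch of how such a compile-time guard is typically consumed:

#include <fstream>
#include <string>

// Hypothetical illustration of the guard enabled by -DPROFILE_SHADER_DUMP=ON;
// the real hook inside the OpenCL runtime is not shown in this commit.
inline void MaybeDumpShader(const std::string& name, const std::string& source) {
#ifdef PROFILE_SHADER_DUMP
  std::ofstream out(name + ".cl");  // dump the kernel source for offline inspection
  out << source;
#endif
}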

include/tvm/relax/exec_builder.h

Lines changed: 9 additions & 0 deletions
@@ -122,6 +122,15 @@ class ExecBuilderNode : public Object {
     rv = value;
     return ConvertConstant_(rv);
   }
+  /*!
+   * \brief Update memory scopes.
+   *
+   * This function records the memory scope for a constant.
+   *
+   * \param idx Index of the constant.
+   * \param scope The memory scope.
+   */
+  void SaveMemoryScope(vm::Instruction::Arg idx, ffi::String scope);
   /*!
    * \brief Raw access to underlying executable build in progress.
    */
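
A brief usage sketch, mirroring the codegen_vm.cc change later in this commit: the builder converts the constant first, then records the scope keyed by the resulting constant index (the local variables here stand in for whatever the caller has in scope):

// Sketch: `builder_` is an ExecBuilder, `constant` a runtime tensor, and
// `vdev` the VDevice attached to the constant's struct info, if any.
vm::Instruction::Arg arg = builder_->ConvertConstant(constant);
if (vdev.defined()) {
  builder_->SaveMemoryScope(arg, vdev.value()->memory_scope);
}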

include/tvm/runtime/tensor.h

Lines changed: 6 additions & 26 deletions
@@ -36,7 +36,6 @@
 
 #include <atomic>
 #include <functional>
-#include <string>
 #include <utility>
 #include <vector>
 
@@ -189,25 +188,14 @@ class Tensor : public tvm::ffi::Tensor {
   */
  TVM_DLL static void CopyFromBytes(const DLTensor* to, void* from, size_t nbytes,
                                    TVMStreamHandle stream = nullptr);
-
-  TVM_DLL void SetScope(ffi::String scope);
-  TVM_DLL ffi::String GetScope() const;
-
- protected:
-  /*!
-   * \brief The memory scope
-   * represents the underlying scope information of device
-   */
-  ffi::String scope = "global";
 };
 
 /*!
  * \brief Save a DLTensor to stream
  * \param strm The output stream
  * \param tensor The tensor to be saved.
- * \param scope The tensor storage scope.
  */
-inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String scope = "global");
+inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor);
 
 inline void Tensor::CopyFrom(const DLTensor* other) {
   ICHECK(data_ != nullptr);
@@ -232,11 +220,10 @@ inline void Tensor::CopyTo(const Tensor& other) const {
 }
 
 /*! \brief Magic number for Tensor file */
-constexpr uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F;
-constexpr uint64_t kTVMNDArrayScopedMagic = 0xDD5E40F096B4A13E;
+constexpr uint64_t kTVMTensorMagic = 0xDD5E40F096B4A13F;
 
-inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String scope) {
-  uint64_t header = kTVMNDArrayScopedMagic, reserved = 0;
+inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor) {
+  uint64_t header = kTVMTensorMagic, reserved = 0;
   strm->Write(header);
   strm->Write(reserved);
   // Always save data as CPU context
@@ -256,7 +243,6 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String
   strm->Write(tensor->dtype);
   int ndim = tensor->ndim;
   strm->WriteArray(tensor->shape, ndim);
-  strm->Write(std::string(scope));
   int type_bytes = (tensor->dtype.bits + 7) / 8;
   int64_t num_elems = 1;
   for (int i = 0; i < ndim; ++i) {
@@ -280,14 +266,13 @@ inline bool SaveDLTensor(dmlc::Stream* strm, const DLTensor* tensor, ffi::String
   return true;
 }
 
-inline void Tensor::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->(), GetScope()); }
+inline void Tensor::Save(dmlc::Stream* strm) const { SaveDLTensor(strm, operator->()); }
 
 inline bool Tensor::Load(dmlc::Stream* strm) {
   uint64_t header, reserved;
   ICHECK(strm->Read(&header)) << "Invalid DLTensor file format";
   ICHECK(strm->Read(&reserved)) << "Invalid DLTensor file format";
-  ICHECK((header == kTVMNDArrayMagic) || (header == kTVMNDArrayScopedMagic))
-      << "Invalid DLTensor file format";
+  ICHECK(header == kTVMTensorMagic) << "Invalid DLTensor file format";
   Device dev;
   int ndim;
   DLDataType dtype;
@@ -305,11 +290,6 @@ inline bool Tensor::Load(dmlc::Stream* strm) {
   for (int i = 0; i < ret->ndim; ++i) {
     num_elems *= ret->shape[i];
   }
-  if (header == kTVMNDArrayScopedMagic) {
-    std::string scope;
-    strm->Read(&scope);
-    ret.SetScope(scope);
-  }
   int64_t data_byte_size;
   ICHECK(strm->Read(&data_byte_size)) << "Invalid DLTensor file format";
   ICHECK(data_byte_size == num_elems * elem_bytes) << "Invalid DLTensor file format";
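
With the scoped magic removed, a serialized tensor now always starts with kTVMTensorMagic followed by a reserved word, and no storage-scope string follows the shape array. A minimal sketch of validating that prefix before parsing the rest of the stream (assumes kTVMTensorMagic is reachable via the tvm::runtime namespace, as declared above):

#include <dmlc/io.h>
#include <tvm/runtime/tensor.h>

// Sketch: check the magic and reserved words at the front of a serialized
// tensor stream; the scoped magic kTVMNDArrayScopedMagic is no longer accepted.
bool CheckTensorHeader(dmlc::Stream* strm) {
  uint64_t header = 0, reserved = 0;
  if (!strm->Read(&header) || !strm->Read(&reserved)) return false;
  return header == tvm::runtime::kTVMTensorMagic;
}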

include/tvm/runtime/vm/executable.h

Lines changed: 12 additions & 0 deletions
@@ -155,6 +155,8 @@ class VMExecutable : public ffi::ModuleObj {
   std::unordered_map<std::string, Index> func_map;
   /*! \brief The global constant pool. */
   std::vector<ffi::Any> constants;
+  /*! \brief The VDevice memory scopes */
+  std::unordered_map<Index, std::string> memory_scopes;
   /*! \brief The offset of instruction. */
   std::vector<Index> instr_offset;
   /*! \brief The byte data of instruction. */
@@ -177,6 +179,11 @@ class VMExecutable : public ffi::ModuleObj {
    * \param strm The input stream.
    */
   void SaveGlobalSection(dmlc::Stream* strm) const;
+  /*!
+   * \brief Save the memory scopes.
+   * \param strm The output stream.
+   */
+  void SaveMemoryScopeSection(dmlc::Stream* strm) const;
   /*!
    * \brief Save the constant pool.
    * \param strm The input stream.
@@ -197,6 +204,11 @@ class VMExecutable : public ffi::ModuleObj {
    * \param strm The input stream.
    */
   void LoadGlobalSection(dmlc::Stream* strm);
+  /*!
+   * \brief Load the memory scopes.
+   * \param strm The input stream.
+   */
+  void LoadMemoryScopeSection(dmlc::Stream* strm);
   /*!
    * \brief Load the constant pool.
    * \param strm The input stream.
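
The bodies of these two methods live in the corresponding .cc file and are not part of this diff. A plausible sketch of how the memory_scopes map could be serialized with dmlc::Stream, using only the member declared above (everything else is an assumption, not the actual implementation):

// Plausible sketch only; field order and error messages are assumptions.
void VMExecutable::SaveMemoryScopeSection(dmlc::Stream* strm) const {
  uint64_t num_entries = static_cast<uint64_t>(memory_scopes.size());
  strm->Write(num_entries);
  for (const auto& [idx, scope] : memory_scopes) {
    strm->Write(static_cast<int64_t>(idx));  // constant index
    strm->Write(scope);                      // scope string, e.g. "global.texture"
  }
}

void VMExecutable::LoadMemoryScopeSection(dmlc::Stream* strm) {
  uint64_t num_entries = 0;
  ICHECK(strm->Read(&num_entries)) << "Invalid memory scope section";
  for (uint64_t i = 0; i < num_entries; ++i) {
    int64_t idx = 0;
    std::string scope;
    ICHECK(strm->Read(&idx) && strm->Read(&scope)) << "Invalid memory scope section";
    memory_scopes[static_cast<Index>(idx)] = scope;
  }
}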

python/tvm/dlight/adreno/convolution.py

Lines changed: 2 additions & 2 deletions
@@ -21,7 +21,7 @@
 from tvm import tir
 from tvm.target import Target
 
-from .utils import schedule_inline_blocks, schedule_storage_annotate, schedule_default
+from .utils import schedule_inline_blocks, schedule_default
 from .. import analysis
 from .base import AdrenoScheduleRule
 
@@ -102,6 +102,6 @@ def is_convolution(blk):
         Conv2d.schedule_conv2d(sch, conv_blk)
         remaining_blocks = schedule_inline_blocks(sch, remaining_blocks)
         schedule_default(sch, remaining_blocks)
-        schedule_storage_annotate(sch, remaining_blocks)
+        # schedule_storage_annotate(sch, remaining_blocks)
 
         return sch

python/tvm/dlight/adreno/fallback.py

Lines changed: 22 additions & 6 deletions
@@ -26,6 +26,22 @@
 from .utils import get_texture_storage
 
 
+def _assert_gpu_target(target: Target):
+    if "gpu" not in target.keys:
+        raise ValueError(f"Expect a GPU target, but got {target}")
+
+
+def get_max_threads_per_block(target: Target) -> int:
+    _assert_gpu_target(target)
+    max_threads_per_block = None
+    for name in ["max_threads_per_block", "max_num_threads"]:
+        if max_threads_per_block is None:
+            max_threads_per_block = target.attrs.get(name, None)
+    if max_threads_per_block is None:
+        max_threads_per_block = 64
+    return int(max_threads_per_block)
+
+
 # pylint: disable=invalid-name,missing-function-docstring,unused-variable,unused-import
 class Fallback(AdrenoScheduleRule):
     """Texture Based Fallback Schedule(s) for Adreno"""
@@ -46,12 +62,12 @@ def schedule_inline_blocks(
         for blk in blocks:
             block_info = analysis.get_block_info(sch, blk)
             if block_info.is_injective() and not block_info.is_data_pad(sch):
-                if len(block_info.consumers) == 1:
+                if len(sch.get_consumers(blk)) == 1:
                     try:
                         sch.compute_inline(blk)
                     except Exception:  # pylint: disable=broad-exception-caught
                         remaining_blocks.append(blk)
-                elif len(block_info.producers) == 1:
+                elif len(sch.get_producers(blk)) == 1:
                     inlined_once = False
                     try:
                         # Would cause an issue inlining to producer with multiple consumers
@@ -76,15 +92,15 @@ def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV):
         block_info = analysis.get_block_info(sch, blk)
 
         s_loops, r_loops, o_loops = [], [], []
-        v_loop = block_info.write_bufs[0].assoc_lps[-1]
+        v_loop = block_info.write_bufs(sch)[0].assoc_lps[-1]
 
         for iter_info in block_info.iters:
            if sch.get(iter_info.loop_rv) == sch.get(v_loop):
                continue
            {"S": s_loops, "R": r_loops, "O": o_loops}.get(iter_info.kind).append(iter_info.loop_rv)
 
         iter_vars = analysis.collect_block_iter_vars_used_in_access_region(
-            block_info.block_stmt, block_info.write_bufs[0].buf_region.region
+            sch.get(blk), block_info.write_bufs(sch)[0].buf_region.region
         )
         o_outer = [lp for lp in o_loops if sch.get(lp).var in iter_vars]
         o_inner = [lp for lp in o_loops if sch.get(lp).var not in iter_vars]
@@ -100,7 +116,7 @@ def schedule_default(sch: tir.Schedule, blk: tir.schedule.BlockRV):
         tgt = Target.current(allow_none=True)
 
         b = sch.fuse(*s_loops)
-        tx_extent = analysis.get_max_threads_per_block(tgt) if tgt is not None else 256
+        tx_extent = get_max_threads_per_block(tgt) if tgt is not None else 256
         bx, tx = sch.split(b, [None, tx_extent])
         sch.bind(bx, "blockIdx.x")
         sch.bind(tx, "threadIdx.x")
@@ -155,7 +171,7 @@ def apply( # pylint: disable=too-many-locals
             return None
 
         block_infos = [analysis.get_block_info(sch, block) for block in blocks]
-        if not any("texture" in block.write_bufs[0].get_scope() for block in block_infos):
+        if not any("texture" in block.write_bufs(sch)[0].get_scope() for block in block_infos):
            return None
 
         Fallback.schedule_fallback(sch)

python/tvm/dlight/adreno/layout_transform.py

Lines changed: 1 addition & 1 deletion
@@ -65,7 +65,7 @@ def apply( # pylint: disable=too-many-locals
         ):
             return None
 
-        read_buf, write_buf = (block_info.read_bufs[0], block_info.write_bufs[0])
+        read_buf, write_buf = (block_info.read_bufs(sch)[0], block_info.write_bufs(sch)[0])
         lps = block_info.get_loops()
         lpv_read, lpv_write = (
             read_buf.assoc_lps[-1],

python/tvm/dlight/adreno/utils.py

Lines changed: 0 additions & 7 deletions
@@ -83,13 +83,6 @@ def schedule_default(sch, blocks: List[tir.schedule.BlockRV] = None):
     return ret
 
 
-def schedule_storage_annotate(sch: tir.Schedule, func=get_texture_storage):
-    # Check the Write Buffer isn't one of input Params and is Texturizable...
-    from .fallback import Fallback
-
-    return Fallback.schedule_annotate_storage(sch)
-
-
 def schedule_fallback(sch, blk):
     from .fallback import Fallback

python/tvm/dlight/analysis/common_analysis.py

Lines changed: 23 additions & 24 deletions
@@ -64,6 +64,12 @@ def __repr__(self) -> str:
 
 
 get_blockrealize = get_global_func("tir.schedule.GetBlockRealize")
+# BufferIndex Types
+Index = namedtuple("Index", ["sub"])  # c
+RemIndex = namedtuple("RemIndex", ["sub", "div"])  # c%len
+DivIndex = namedtuple("DivIndex", ["sub", "div"])  # c//len
+MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"])  # co*len + cb
+BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]]
 
 
 # TODO: Shift Vlen Calculation here...
@@ -74,13 +80,6 @@ class BufferInfo:
     assoc_lps: List[Union[tir.schedule.LoopRV, None]]
     assoc_lps_info: List[Union[tir.For, None]]
 
-    # BufferIndex Types
-    Index = namedtuple("Index", ["sub"])  # c
-    RemIndex = namedtuple("RemIndex", ["sub", "div"])  # c%len
-    DivIndex = namedtuple("DivIndex", ["sub", "div"])  # c//len
-    MergeIndex = namedtuple("MulIndex", ["dom", "mul", "sub"])  # co*len + cb
-    BufIndex = List[Union[Index, RemIndex, DivIndex, MergeIndex, None]]
-
     def __init__(
         self,
         sch: tir.Schedule,
@@ -172,8 +171,6 @@ class BlockInfo:
     iters: List[IterInfo]
     block_rv: tir.schedule.BlockRV
     _reduction_block: bool
-    read_bufs: List[BufferInfo]
-    write_bufs: List[BufferInfo]
 
     def __init__(
         self,
@@ -192,6 +189,16 @@ def dom(self) -> List[Union[int, tir.PrimExpr]]:
         """The iteration domain of the block."""
         return [i.dom for i in self.iters]
 
+    def read_bufs(self, sch: tir.Schedule) -> List[BufferInfo]:
+        block_stmt = sch.get(self.block_rv)
+        lps = sch.get_loops(self.block_rv)
+        return [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads]
+
+    def write_bufs(self, sch: tir.Schedule) -> List[BufferInfo]:
+        block_stmt = sch.get(self.block_rv)
+        lps = sch.get_loops(self.block_rv)
+        return [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes]
+
     def dom_kind(self) -> str:
         """The iteration domain kind of the block, for example, SSSS, SSSR."""
         return "".join(i.kind for i in self.iters)
@@ -216,7 +223,7 @@ def _check_unit_var_range(dom: ir.Range, var: tir.Var) -> bool:
         if len(r_region) != len(w_region):
             return False
         for var, r_dom, w_dom in zip(block.iter_vars, r_region, w_region):
-            if not _check_unit_var_range(var, r_dom) or not _check_unit_var_range(var, w_dom):
+            if not _check_unit_var_range(r_dom, var) or not _check_unit_var_range(w_dom, var):
                 return False
         return True
 
@@ -230,31 +237,23 @@ def is_reduction(self) -> bool:
 
     def is_layout_transform(self, sch: tir.Schedule) -> bool:
         """Whether the Block can be considered having a Layout Transform Pattern"""
-        block_stmt = sch.get(self.block_rv)
-        lps = sch.get_loops(block_rv)
-        read_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads]
-        write_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes]
         return (
             all(k == "S" for k in self.dom_kind())
-            and len(write_bufs) == 1
-            and len(read_bufs) == 1
+            and len(self.write_bufs(sch)) == 1
+            and len(self.read_bufs(sch)) == 1
             and not self.is_elementwise(sch)
             and not get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv))
         )
 
     def is_data_pad(self, sch: tir.Schedule) -> bool:
         """Whether the Block can be considered having a data pad pattern"""
-        block_stmt = sch.get(self.block_rv)
-        lps = sch.get_loops(block_rv)
-        read_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.reads]
-        write_bufs = [BufferInfo(sch, self.block_rv, buf, lps) for buf in block_stmt.writes]
         return (
             all(k == "S" for k in self.dom_kind())
-            and len(write_bufs) == 1
-            and len(read_bufs) == 1
+            and len(self.write_bufs(sch)) == 1
+            and len(self.read_bufs(sch)) == 1
             and not self.is_elementwise(sch)
-            and len(self.write_bufs[0].buf_region.region)
-            == len(self.read_bufs[0].buf_region.region)
+            and len(self.write_bufs(sch)[0].buf_region.region)
+            == len(self.read_bufs(sch)[0].buf_region.region)
             and get_global_func("tir.schedule.HasIfThenElse")(sch.get(self.block_rv))
         )

src/relax/backend/vm/codegen_vm.cc

Lines changed: 4 additions & 4 deletions
@@ -215,15 +215,15 @@ class CodeGenVM : public ExprFunctor<Instruction::Arg(const Expr&)> {
   }
 
   Instruction::Arg VisitExpr_(const ConstantNode* op) final {
+    auto arg = builder_->ConvertConstant(op->data);
+
     if (auto tsinfo = op->struct_info_.as<TensorStructInfoNode>()) {
       if (tsinfo->vdevice.defined()) {
         VDevice vdev = tsinfo->vdevice.value();
-        runtime::Tensor param = op->data;
-        param.SetScope(vdev->memory_scope);
+        builder_->SaveMemoryScope(arg, vdev->memory_scope);
       }
     }
-
-    return builder_->ConvertConstant(op->data);
+    return arg;
   }
 
   Instruction::Arg VisitExpr_(const ShapeExprNode* op) final {
