diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml
index 265de5bc2..a451d6529 100644
--- a/.github/workflows/presubmit.yml
+++ b/.github/workflows/presubmit.yml
@@ -4,51 +4,9 @@ on:
     types: [opened, synchronize, reopened]
 
 jobs:
-  title_format:
-    name: Check PR Title
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.8
-
-      - name: Run PR Title Checker
-        run: |
-          pip install semver GitPython
-          python misc/ci_check_pr_title.py "$PR_TITLE"
-        env:
-          PR_TITLE: ${{ github.event.pull_request.title }}
-
-  check_code_format:
-    name: Check Code Format
-    runs-on: ubuntu-latest
-    # This job will be required to pass before merging to master branch.
-    steps:
-      - uses: actions/checkout@v2
-      - name: Set up Python ${{ matrix.python-version }}
-        uses: actions/setup-python@v2
-        with:
-          python-version: 3.8
-      - name: Check code format
-        run: |
-          git config user.email "taichigardener@gmail.com"
-          git config user.name "Taichi Gardener"
-          git checkout -b _fake_squash
-          git remote add upstream https://github.com/taichi-dev/taichi.git
-          git fetch upstream master
-          sudo apt-get install clang-format
-          python3 -m pip install --user yapf gitpython colorama isort
-          python3 python/taichi/code_format.py
-          git checkout -b _enforced_format
-          git commit -am "enforce code format" || true
-          # exit with 1 if there were differences:
-          git diff _fake_squash _enforced_format --exit-code
-
   build_and_test_cpu_required:
     # This job will be required to pass before merging to master branch.
     name: Required Build and Test (CPU)
-    needs: check_code_format
     strategy:
       matrix:
         include:
@@ -95,7 +53,6 @@ jobs:
 
   build_and_test_cpu:
     name: Build and Test (CPU)
-    needs: build_and_test_cpu_required
     strategy:
       matrix:
         include:
@@ -156,91 +113,3 @@ jobs:
           ti test -vr2 -t2
         env:
           RUN_CPP_TESTS: ${{ matrix.with_cpp_tests }}
-
-  build_and_test_gpu_linux:
-    name: Build and Test (GPU)
-    needs: check_code_format
-    runs-on: [self-hosted, cuda, cn]
-    steps:
-      - uses: actions/checkout@v2
-
-      - name: Build
-        run: |
-          git --version
-          export PATH=/home/github/taichi-llvm/bin/:$PATH
-          export CXX=clang++-8
-          export PYTHON=/usr/bin/python3
-          $PYTHON misc/ci_setup.py ci
-        env:
-          CI_SETUP_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=ON -DTI_WITH_CC:BOOL=OFF
-
-      - name: Test
-        run: |
-          export PYTHON=/usr/bin/python3
-          export PATH=/home/github/taichi-llvm/bin/:$PATH
-          export PATH=$PATH:$HOME/.local/bin
-          export DISPLAY=:1
-          hash -r
-          glewinfo
-          $PYTHON examples/algorithm/laplace.py
-          ti diagnose
-          ti test -vr2 -t2
-
-  build_and_test_windows:
-    name: Build and Test (Windows)
-    needs: check_code_format
-    runs-on: windows-latest
-    steps:
-      - name: Install 7Zip PowerShell
-        shell: powershell
-        run: Install-Module 7Zip4PowerShell -Force -Verbose
-
-      - uses: actions/checkout@v2
-      - uses: actions/setup-python@v2
-        with:
-          python-version: 3.7
-
-      - name: Add msbuild to PATH
-        uses: microsoft/setup-msbuild@v1.0.2
-
-      - name: Build
-        shell: powershell
-        run: |
-          $env:TAICHI_REPO_DIR = "D:\a\taichi\taichi"
-          $env:PYTHONPATH = "$env:TAICHI_REPO_DIR\python"
-          cd C:\
-          Remove-item alias:curl
-          curl --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip -LO
-          7z x taichi-llvm-10.0.0-msvc2019.zip -otaichi_llvm
-          curl --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip -LO
-          7z x clang-10.0.0-win.zip -otaichi_clang
-          $env:PATH += ";C:\taichi_llvm\bin"
-          $env:PATH += ";C:\taichi_clang\bin"
-          $env:PATH += ";$env:TAICHI_REPO_DIR\bin"
-          clang --version
-          cd D:\a\taichi\taichi
-          python -m pip install numpy
-          python -m pip install pybind11
-          python misc/ci_setup.py ci
-          mkdir build
-          cd build
-          cmake .. -G"Visual Studio 16 2019" -A x64 -DPYTHON_EXECUTABLE="$env:PYTHON" -DLLVM_DIR="C:\taichi_llvm\lib\cmake\llvm"
-          msbuild /p:Configuration=RelWithDebInfo /p:Platform=x64 /m taichi.sln
-          cd ..
-        env:
-          PYTHON: C:\hostedtoolcache\windows\Python\3.7.9\x64\python.exe
-
-      - name: Test
-        shell: powershell
-        run: |
-          $env:TAICHI_REPO_DIR = "D:\a\taichi\taichi"
-          $env:PYTHONPATH = "$env:TAICHI_REPO_DIR\python"
-          $env:PATH += ";C:\taichi_llvm\bin"
-          $env:PATH += ";C:\taichi_clang\bin"
-          $env:PATH += ";$env:TAICHI_REPO_DIR\bin"
-          python -c "import taichi"
-          python examples/algorithm/laplace.py
-          python bin/taichi diagnose
-          python bin/taichi test -Cvr2 -t2
-        env:
-          PYTHON: C:\hostedtoolcache\windows\Python\3.7.9\x64\python.exe
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 181ed4b34..990ff3489 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -94,8 +94,8 @@ endif()
 foreach(arch IN LISTS HOST_ARCH CUDA_ARCH)
   add_custom_target(
       "generate_llvm_runtime_${arch}"
-      COMMAND ${CLANG_EXECUTABLE} -S runtime.cpp -o runtime.ll -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR};
-      COMMAND ${LLVM_AS_EXECUTABLE} runtime.ll -o "runtime_${arch}.bc"
+      COMMAND ${CLANG_EXECUTABLE} -S runtime.cpp -o runtime_${arch}.ll -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR};
+      COMMAND ${LLVM_AS_EXECUTABLE} runtime_${arch}.ll -o "runtime_${arch}.bc"
       WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/taichi/runtime/llvm"
   )
   add_dependencies(${CORE_LIBRARY_NAME} "generate_llvm_runtime_${arch}")
diff --git a/taichi/analysis/bls_analyzer.cpp b/taichi/analysis/bls_analyzer.cpp
index d2105daca..868df66ab 100644
--- a/taichi/analysis/bls_analyzer.cpp
+++ b/taichi/analysis/bls_analyzer.cpp
@@ -25,8 +25,7 @@ void BLSAnalyzer::generate_block_indices(SNode *snode, BlockIndices *indices) {
   // NOTE: Assuming not vectorized
   for (int i = 0; i < snode->num_active_indices; i++) {
     auto j = snode->physical_index_position[i];
-    indices->push_back(
-        {/*low=*/0, /*high=*/(1 << snode->extractors[j].num_bits) - 1});
+    indices->push_back({/*low=*/0, /*high=*/snode->extractors[j].shape - 1});
   }
 }
 
diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp
index fc48b7f01..6b311564d 100644
--- a/taichi/backends/metal/kernel_manager.cpp
+++ b/taichi/backends/metal/kernel_manager.cpp
@@ -816,9 +816,10 @@ class KernelManager::Impl {
         const auto &ext = sn->extractors[j];
         rtm_ext->extractors[j].num_bits = ext.num_bits;
         rtm_ext->extractors[j].acc_offset = ext.acc_offset;
-        rtm_ext->extractors[j].num_elements = ext.num_elements;
-        TI_DEBUG("  [{}] num_bits={} acc_offset={} num_elements={}", j,
-                 ext.num_bits, ext.acc_offset, ext.num_elements);
+        rtm_ext->extractors[j].num_elements_from_root =
+            ext.num_elements_from_root;
+        TI_DEBUG("  [{}] num_bits={} acc_offset={} num_elements_from_root={}",
+                 j, ext.num_bits, ext.acc_offset, ext.num_elements_from_root);
       }
       TI_DEBUG("");
     }
diff --git a/taichi/backends/metal/shaders/runtime_structs.metal.h b/taichi/backends/metal/shaders/runtime_structs.metal.h
index 171e8463f..9b3347809 100644
--- a/taichi/backends/metal/shaders/runtime_structs.metal.h
+++ b/taichi/backends/metal/shaders/runtime_structs.metal.h
@@ -117,7 +117,7 @@ STR(
         int32_t start = 0;
         int32_t num_bits = 0;
         int32_t acc_offset = 0;
-        int32_t num_elements = 0;
+        int32_t num_elements_from_root = 0;
       };
 
       Extractor extractors[kTaichiMaxNumIndices];
diff --git a/taichi/backends/metal/struct_metal.cpp b/taichi/backends/metal/struct_metal.cpp
index 8504e046b..f0c654693 100644
--- a/taichi/backends/metal/struct_metal.cpp
+++ b/taichi/backends/metal/struct_metal.cpp
@@ -337,7 +337,7 @@ class StructCompiler {
     }
     sn_desc.total_num_elems_from_root = 1;
     for (const auto &e : sn->extractors) {
-      sn_desc.total_num_elems_from_root *= e.num_elements;
+      sn_desc.total_num_elems_from_root *= e.num_elements_from_root;
     }
 
     TI_ASSERT(snode_descriptors_.find(sn->id) == snode_descriptors_.end());
diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp
index 59d4b9335..3e1bd8c05 100644
--- a/taichi/codegen/codegen_llvm.cpp
+++ b/taichi/codegen/codegen_llvm.cpp
@@ -1673,15 +1673,18 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) {
 
     auto coord_object = RuntimeObject(kLLVMPhysicalCoordinatesName, this,
                                       builder.get(), new_coordinates);
-    for (int i = 0; i < snode->num_active_indices; i++) {
-      auto j = snode->physical_index_position[i];
-      if (!bit::is_power_of_two(snode->extractors[j].num_elements)) {
-        auto coord = coord_object.get("val", tlctx->get_constant(j));
-        exec_cond = builder->CreateAnd(
-            exec_cond,
-            builder->CreateICmp(
-                llvm::CmpInst::ICMP_SLT, coord,
-                tlctx->get_constant(snode->extractors[j].num_elements)));
+    if (!prog->config.packed) {
+      for (int i = 0; i < snode->num_active_indices; i++) {
+        auto j = snode->physical_index_position[i];
+        if (!bit::is_power_of_two(
+                snode->extractors[j].num_elements_from_root)) {
+          auto coord = coord_object.get("val", tlctx->get_constant(j));
+          exec_cond = builder->CreateAnd(
+              exec_cond, builder->CreateICmp(
+                             llvm::CmpInst::ICMP_SLT, coord,
+                             tlctx->get_constant(
+                                 snode->extractors[j].num_elements_from_root)));
+        }
       }
     }
 
diff --git a/taichi/ir/scratch_pad.h b/taichi/ir/scratch_pad.h
index 7081be117..7df97db99 100644
--- a/taichi/ir/scratch_pad.h
+++ b/taichi/ir/scratch_pad.h
@@ -103,8 +103,7 @@ class ScratchPad {
     block_size.resize(dim);
     for (int i = 0; i < dim; i++) {
       block_size[i] =
-          1 << snode->parent->extractors[snode->physical_index_position[i]]
-                   .num_bits;
+          snode->parent->extractors[snode->physical_index_position[i]].shape;
       TI_ASSERT(bounds[i].low != std::numeric_limits<int>::max());
       TI_ASSERT(bounds[i].high != std::numeric_limits<int>::min());
     }
diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp
index 2bdf25156..60d6da07a 100644
--- a/taichi/ir/snode.cpp
+++ b/taichi/ir/snode.cpp
@@ -2,6 +2,7 @@
 
 #include "taichi/ir/ir.h"
 #include "taichi/ir/statements.h"
+#include "taichi/program/program.h"
 
 TLANG_NAMESPACE_BEGIN
 
@@ -34,23 +35,13 @@ SNode &SNode::create_node(std::vector<Index> indices,
                    "hashed node must be child of root due to initialization "
                    "memset limitation.");
   auto &new_node = insert_children(type);
-  new_node.n = 1;
-  for (int i = 0; i < sizes.size(); i++) {
-    auto s = sizes[i];
-    TI_ASSERT(sizes[i] > 0);
-    if (!bit::is_power_of_two(s)) {
-      auto promoted_s = bit::least_pot_bound(s);
-      TI_DEBUG("Non-power-of-two node size {} promoted to {}.", s, promoted_s);
-      s = promoted_s;
-    }
-    TI_ASSERT(bit::is_power_of_two(s));
-    new_node.n *= s;
-  }
   for (int i = 0; i < (int)indices.size(); i++) {
+    TI_ASSERT(sizes[i] > 0);
     auto &ind = indices[i];
     new_node.extractors[ind.value].activate(
         bit::log2int(bit::least_pot_bound(sizes[i])));
-    new_node.extractors[ind.value].num_elements = sizes[i];
+    new_node.extractors[ind.value].shape = sizes[i];
+    new_node.extractors[ind.value].num_elements_from_root = sizes[i];
   }
   return new_node;
 }
@@ -99,7 +90,7 @@ SNode *SNode::get_least_sparse_ancestor() const {
 
 int SNode::shape_along_axis(int i) const {
   const auto &extractor = extractors[physical_index_position[i]];
-  return extractor.num_elements;
+  return extractor.num_elements_from_root;
 }
 
 SNode::SNode() : SNode(0, SNodeType::undefined) {
diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h
index 5740e0d27..060acc332 100644
--- a/taichi/ir/snode.h
+++ b/taichi/ir/snode.h
@@ -34,26 +34,34 @@ class Index {
  */
 struct IndexExtractor {
   /**
-   * Shape at the given index.
+   * Number of elements from root at this index.
    *
-   * This is the raw shape, *not* padded to power-of-two (POT).
+   * This is the raw number, *not* padded to power-of-two (POT).
    */
-  int num_elements{1};
+  int num_elements_from_root{1};
+  /**
+   * Shape at this index: padded to POT unless the config enables `packed`.
+   */
+  int shape{1};
+  /**
+   * Product of `shape` over all indices after this one (its linear stride).
+   */
+  int acc_shape{1};
   /**
    * Number of bits needed to store the coordinate at this index.
    *
-   * ceil(log2(num_elements))
+   * ceil(log2(shape))
    */
   int num_bits{0};
   /**
    * Accumulated offset from the last activated index to the first one.
    *
-   * This is the starting bit of this index in a linearized 1D coordiate. For
+   * This is the starting bit of this index in a linearized 1D coordinate. For
    * example, assuming an SNode of (ti.ijk, shape=(4, 8, 16)). ti.i takes 2
    * bits, ti.j 3 bits and ti.k 4 bits. Then for a linearized coordinate:
-   * ti.k uses bits [0, 3), acc_offset=0
-   * tk.j uses btis [3, 6), acc_offset=3
-   * ti.i uses bits [6, 8), acc_offset=6
+   * ti.k uses bits [0, 4), acc_offset=0
+   * ti.j uses bits [4, 7), acc_offset=4
+   * ti.i uses bits [7, 9), acc_offset=7
    */
   int acc_offset{0};
   /**
@@ -106,7 +114,7 @@ class SNode {
   int depth{0};
 
   std::string name;
-  int64 n{0};
+  int64 n{1};
   int total_num_bits{0};
   int total_bit_start{0};
   int chunk_size{0};
@@ -283,7 +291,7 @@ class SNode {
   }
 
   int64 max_num_elements() const {
-    return int64(1) << total_num_bits;
+    return n;
   }
 
   int shape_along_axis(int i) const;
diff --git a/taichi/ir/transforms.h b/taichi/ir/transforms.h
index 5b6059916..6b22cc475 100644
--- a/taichi/ir/transforms.h
+++ b/taichi/ir/transforms.h
@@ -106,7 +106,7 @@ bool replace_and_insert_statements(
 bool replace_statements(IRNode *root,
                         std::function<bool(Stmt *)> filter,
                         std::function<Stmt *(Stmt *)> finder);
-void demote_dense_struct_fors(IRNode *root);
+void demote_dense_struct_fors(IRNode *root, bool packed);
 bool demote_atomics(IRNode *root, const CompileConfig &config);
 void reverse_segments(IRNode *root);  // for autograd
 void detect_read_only(IRNode *root);
diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp
index a06805f48..9e126b589 100644
--- a/taichi/program/compile_config.cpp
+++ b/taichi/program/compile_config.cpp
@@ -8,6 +8,7 @@ CompileConfig::CompileConfig() {
   arch = host_arch();
   simd_width = default_simd_width(arch);
   external_optimization_level = 3;
+  packed = false;
   print_ir = false;
   print_accessor_ir = false;
   print_evaluator_ir = false;
diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h
index e42d4e265..3c2d9b465 100644
--- a/taichi/program/compile_config.h
+++ b/taichi/program/compile_config.h
@@ -14,6 +14,7 @@ struct CompileConfig {
   bool lazy_compilation;
   int external_optimization_level;
   int max_vector_width;
+  bool packed;
   bool print_ir;
   bool print_accessor_ir;
   bool print_evaluator_ir;
diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp
index 0b11a2061..b8c88894e 100644
--- a/taichi/program/program.cpp
+++ b/taichi/program/program.cpp
@@ -429,7 +429,7 @@ void Program::initialize_llvm_runtime_snodes(const SNodeTree *tree,
 
 int Program::add_snode_tree(std::unique_ptr<SNode> root) {
   const int id = snode_trees_.size();
-  auto tree = std::make_unique<SNodeTree>(id, std::move(root));
+  auto tree = std::make_unique<SNodeTree>(id, std::move(root), config.packed);
   tree->root()->set_snode_tree_id(id);
   materialize_snode_tree(tree.get());
   snode_trees_.push_back(std::move(tree));
diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp
index ff94466ec..ed469c619 100644
--- a/taichi/python/export_lang.cpp
+++ b/taichi/python/export_lang.cpp
@@ -119,6 +119,7 @@ void export_lang(py::module &m) {
   py::class_<CompileConfig>(m, "CompileConfig")
       .def(py::init<>())
       .def_readwrite("arch", &CompileConfig::arch)
+      .def_readwrite("packed", &CompileConfig::packed)
       .def_readwrite("print_ir", &CompileConfig::print_ir)
       .def_readwrite("debug", &CompileConfig::debug)
       .def_readwrite("cfg_optimization", &CompileConfig::cfg_optimization)
diff --git a/taichi/struct/snode_tree.cpp b/taichi/struct/snode_tree.cpp
index f26a48543..4893b66ab 100644
--- a/taichi/struct/snode_tree.cpp
+++ b/taichi/struct/snode_tree.cpp
@@ -5,9 +5,9 @@
 namespace taichi {
 namespace lang {
 
-SNodeTree::SNodeTree(int id, std::unique_ptr<SNode> root)
+SNodeTree::SNodeTree(int id, std::unique_ptr<SNode> root, bool packed)
     : id_(id), root_(std::move(root)) {
-  infer_snode_properties(*root_);
+  infer_snode_properties(*root_, packed);
 }
 
 }  // namespace lang
diff --git a/taichi/struct/snode_tree.h b/taichi/struct/snode_tree.h
index 58f58f2d3..30ef34dc4 100644
--- a/taichi/struct/snode_tree.h
+++ b/taichi/struct/snode_tree.h
@@ -22,7 +22,7 @@ class SNodeTree {
    * @param id Id of the tree
    * @param root Root of the tree
    */
-  explicit SNodeTree(int id, std::unique_ptr<SNode> root);
+  explicit SNodeTree(int id, std::unique_ptr<SNode> root, bool packed);
 
   int id() const {
     return id_;
diff --git a/taichi/struct/struct.cpp b/taichi/struct/struct.cpp
index 01115c79b..d4b8355e7 100644
--- a/taichi/struct/struct.cpp
+++ b/taichi/struct/struct.cpp
@@ -9,12 +9,13 @@
 namespace taichi {
 namespace lang {
 
-void infer_snode_properties(SNode &snode) {
+void infer_snode_properties(SNode &snode, bool packed) {
   for (int ch_id = 0; ch_id < (int)snode.ch.size(); ch_id++) {
     auto &ch = snode.ch[ch_id];
     ch->parent = &snode;
     for (int i = 0; i < taichi_max_num_indices; i++) {
-      ch->extractors[i].num_elements *= snode.extractors[i].num_elements;
+      ch->extractors[i].num_elements_from_root *=
+          snode.extractors[i].num_elements_from_root;
       bool found = false;
       for (int k = 0; k < taichi_max_num_indices; k++) {
         if (snode.physical_index_position[k] == i) {
@@ -40,15 +41,27 @@ void infer_snode_properties(SNode &snode) {
       ch->is_bit_level = snode.is_bit_level;
     }
 
-    infer_snode_properties(*ch);
+    infer_snode_properties(*ch, packed);
   }
 
   // infer extractors
+  int64 acc_shape = 1;  // 64-bit: the total cell count may exceed INT_MAX
+  for (int i = taichi_max_num_indices - 1; i >= 0; i--) {
+    // if not in packed mode, pad shape to POT
+    if (!packed) {
+      snode.extractors[i].shape = 1 << snode.extractors[i].num_bits;
+    }
+    snode.extractors[i].acc_shape = static_cast<int>(acc_shape);
+    acc_shape *= snode.extractors[i].shape;
+  }
+  snode.n = acc_shape;
+  // infer extractors (only for POT)
   int acc_offsets = 0;
   for (int i = taichi_max_num_indices - 1; i >= 0; i--) {
     snode.extractors[i].acc_offset = acc_offsets;
     acc_offsets += snode.extractors[i].num_bits;
   }
+  snode.total_num_bits = acc_offsets;
   if (snode.type == SNodeType::dynamic) {
     int active_extractor_counder = 0;
     for (int i = 0; i < taichi_max_num_indices; i++) {
@@ -67,10 +80,6 @@ void infer_snode_properties(SNode &snode) {
                    "Dynamic SNode can have only one index extractor.");
   }
 
-  snode.total_num_bits = 0;
-  for (int i = 0; i < taichi_max_num_indices; i++) {
-    snode.total_num_bits += snode.extractors[i].num_bits;
-  }
   // The highest bit is for the sign.
   constexpr int kMaxTotalNumBits = 64;
   TI_ERROR_IF(
diff --git a/taichi/struct/struct.h b/taichi/struct/struct.h
index 7883fd12b..dd7be4af8 100644
--- a/taichi/struct/struct.h
+++ b/taichi/struct/struct.h
@@ -12,7 +12,7 @@ namespace lang {
  *
  * @param snode The root SNode to compute.
  */
-void infer_snode_properties(SNode &snode);
+void infer_snode_properties(SNode &snode, bool packed);
 
 class StructCompiler {
  public:
diff --git a/taichi/struct/struct_llvm.cpp b/taichi/struct/struct_llvm.cpp
index c8246271d..5c32b8681 100644
--- a/taichi/struct/struct_llvm.cpp
+++ b/taichi/struct/struct_llvm.cpp
@@ -192,20 +192,39 @@ void StructCompilerLLVM::generate_refine_coordinates(SNode *snode) {
   auto outp_coords = args[1];
   auto l = args[2];
 
-  for (int i = 0; i < taichi_max_num_indices; i++) {
-    auto addition = tlctx_->get_constant(0);
-    if (snode->extractors[i].num_bits) {
-      auto mask = ((1 << snode->extractors[i].num_bits) - 1);
-      addition = builder.CreateAnd(
-          builder.CreateAShr(l, snode->extractors[i].acc_offset), mask);
+  if (config_->packed) {  // no dependence on POT
+    for (int i = 0; i < taichi_max_num_indices; i++) {
+      auto addition = tlctx_->get_constant(0);
+      if (snode->extractors[i].shape > 1) {
+        auto prev = tlctx_->get_constant(snode->extractors[i].acc_shape *
+                                         snode->extractors[i].shape);
+        auto next = tlctx_->get_constant(snode->extractors[i].acc_shape);
+        addition = builder.CreateSDiv(builder.CreateSRem(l, prev), next);
+      }
+      auto in = call(&builder, "PhysicalCoordinates_get_val", inp_coords,
+                     tlctx_->get_constant(i));
+      in = builder.CreateMul(in,
+                             tlctx_->get_constant(snode->extractors[i].shape));
+      auto added = builder.CreateAdd(in, addition);
+      call(&builder, "PhysicalCoordinates_set_val", outp_coords,
+           tlctx_->get_constant(i), added);
+    }
+  } else {
+    for (int i = 0; i < taichi_max_num_indices; i++) {
+      auto addition = tlctx_->get_constant(0);
+      if (snode->extractors[i].num_bits) {
+        auto mask = ((1 << snode->extractors[i].num_bits) - 1);
+        addition = builder.CreateAnd(
+            builder.CreateAShr(l, snode->extractors[i].acc_offset), mask);
+      }
+      auto in = call(&builder, "PhysicalCoordinates_get_val", inp_coords,
+                     tlctx_->get_constant(i));
+      in = builder.CreateShl(
+          in, tlctx_->get_constant(snode->extractors[i].num_bits));
+      auto added = builder.CreateOr(in, addition);
+      call(&builder, "PhysicalCoordinates_set_val", outp_coords,
+           tlctx_->get_constant(i), added);
     }
-    auto in = call(&builder, "PhysicalCoordinates_get_val", inp_coords,
-                   tlctx_->get_constant(i));
-    in = builder.CreateShl(in,
-                           tlctx_->get_constant(snode->extractors[i].num_bits));
-    auto added = builder.CreateOr(in, addition);
-    call(&builder, "PhysicalCoordinates_set_val", outp_coords,
-         tlctx_->get_constant(i), added);
   }
   builder.CreateRetVoid();
 }
diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp
index 367b7f2cb..24778f996 100644
--- a/taichi/transforms/compile_to_offloads.cpp
+++ b/taichi/transforms/compile_to_offloads.cpp
@@ -173,7 +173,7 @@ void offload_to_executable(IRNode *ir,
   irpass::analysis::verify(ir);
 
   if (config.demote_dense_struct_fors) {
-    irpass::demote_dense_struct_fors(ir);
+    irpass::demote_dense_struct_fors(ir, config.packed);
     irpass::type_check(ir, config);
     print("Dense struct-for demoted");
     irpass::analysis::verify(ir);
diff --git a/taichi/transforms/demote_dense_struct_fors.cpp b/taichi/transforms/demote_dense_struct_fors.cpp
index 23656aa74..2bc995547 100644
--- a/taichi/transforms/demote_dense_struct_fors.cpp
+++ b/taichi/transforms/demote_dense_struct_fors.cpp
@@ -2,6 +2,7 @@
 #include "taichi/ir/statements.h"
 #include "taichi/ir/transforms.h"
 #include "taichi/ir/visitors.h"
+#include "taichi/transforms/utils.h"
 
 TLANG_NAMESPACE_BEGIN
 
@@ -9,7 +10,7 @@ namespace {
 
 using TaskType = OffloadedStmt::TaskType;
 
-void convert_to_range_for(OffloadedStmt *offloaded) {
+void convert_to_range_for(OffloadedStmt *offloaded, bool packed) {
   TI_ASSERT(offloaded->task_type == TaskType::struct_for);
 
   std::vector<SNode *> snodes;
@@ -27,10 +28,21 @@ void convert_to_range_for(OffloadedStmt *offloaded) {
   std::reverse(snodes.begin(), snodes.end());
   TI_ASSERT(total_bits <= 30);
 
+  // general shape calculation - no dependence on POT
+  int total_n = 1;
+  std::array<int, taichi_max_num_indices> total_shape;
+  total_shape.fill(1);
+  for (const auto *s : snodes) {
+    for (int j = 0; j < taichi_max_num_indices; j++) {
+      total_shape[j] *= s->extractors[j].shape;
+    }
+    total_n *= s->n;
+  }
+
   offloaded->const_begin = true;
   offloaded->const_end = true;
   offloaded->begin_value = 0;
-  offloaded->end_value = 1 << total_bits;
+  offloaded->end_value = total_n;
 
   ////// Begin core transformation
   auto body = std::move(offloaded->body);
@@ -51,47 +63,70 @@ void convert_to_range_for(OffloadedStmt *offloaded) {
   auto main_loop_var = body_header.push_back<LoopIndexStmt>(nullptr, 0);
   // We will set main_loop_var->loop later.
 
-  int offset = total_bits;
-  int start_bits[taichi_max_num_indices] = {0};
-  std::copy(std::begin(start_bits_root), std::end(start_bits_root),
-            std::begin(start_bits));
   Stmt *test = body_header.push_back<ConstStmt>(TypedConstant(-1));
   bool has_test = false;
-  for (int i = 0; i < (int)snodes.size(); i++) {
-    auto snode = snodes[i];
-    offset -= snode->total_num_bits;
-    for (int j = 0; j < (int)physical_indices.size(); j++) {
-      auto p = physical_indices[j];
-      auto ext = snode->extractors[p];
-      Stmt *delta = body_header.push_back<BitExtractStmt>(
-          main_loop_var, ext.acc_offset + offset,
-          ext.acc_offset + offset + ext.num_bits);
-      start_bits[p] -= ext.num_bits;
-      auto multiplier =
-          body_header.push_back<ConstStmt>(TypedConstant(1 << start_bits[p]));
-      delta = body_header.push_back<BinaryOpStmt>(BinaryOpType::mul, delta,
-                                                  multiplier);
-      new_loop_vars[j] = body_header.push_back<BinaryOpStmt>(
-          BinaryOpType::add, new_loop_vars[j], delta);
+  if (packed) {  // no dependence on POT
+    for (int i = 0; i < (int)snodes.size(); i++) {
+      auto snode = snodes[i];
+      auto extracted = generate_mod_x_div_y(&body_header, main_loop_var,
+                                            total_n, total_n / snode->n);
+      total_n /= snode->n;
+      for (int j = 0; j < (int)physical_indices.size(); j++) {
+        auto p = physical_indices[j];
+        auto ext = snode->extractors[p];
+        auto index = generate_mod_x_div_y(
+            &body_header, extracted, ext.acc_shape * ext.shape, ext.acc_shape);
+        total_shape[p] /= ext.shape;
+        auto multiplier =
+            body_header.push_back<ConstStmt>(TypedConstant(total_shape[p]));
+        auto delta = body_header.push_back<BinaryOpStmt>(BinaryOpType::mul,
+                                                         index, multiplier);
+        new_loop_vars[j] = body_header.push_back<BinaryOpStmt>(
+            BinaryOpType::add, new_loop_vars[j], delta);
+      }
+    }
+  } else {
+    int offset = total_bits;
+    int start_bits[taichi_max_num_indices] = {0};
+    std::copy(std::begin(start_bits_root), std::end(start_bits_root),
+              std::begin(start_bits));
+    for (int i = 0; i < (int)snodes.size(); i++) {
+      auto snode = snodes[i];
+      offset -= snode->total_num_bits;
+      for (int j = 0; j < (int)physical_indices.size(); j++) {
+        auto p = physical_indices[j];
+        auto ext = snode->extractors[p];
+        Stmt *delta = body_header.push_back<BitExtractStmt>(
+            main_loop_var, ext.acc_offset + offset,
+            ext.acc_offset + offset + ext.num_bits);
+        start_bits[p] -= ext.num_bits;
+        auto multiplier =
+            body_header.push_back<ConstStmt>(TypedConstant(1 << start_bits[p]));
+        delta = body_header.push_back<BinaryOpStmt>(BinaryOpType::mul, delta,
+                                                    multiplier);
+        new_loop_vars[j] = body_header.push_back<BinaryOpStmt>(
+            BinaryOpType::add, new_loop_vars[j], delta);
+      }
     }
-  }
 
-  std::copy(std::begin(start_bits_root), std::end(start_bits_root),
-            std::begin(start_bits));
-  for (int i = 0; i < (int)snodes.size(); i++) {
-    auto snode = snodes[i];
-    for (int j = 0; j < (int)physical_indices.size(); j++) {
-      auto p = physical_indices[j];
-      start_bits[p] -= snode->extractors[p].num_bits;
-      auto num_elements = snode->extractors[p].num_elements << start_bits[p];
-      if (!bit::is_power_of_two(num_elements)) {
-        has_test = true;
-        auto bound =
-            body_header.push_back<ConstStmt>(TypedConstant(num_elements));
-        auto cmp = body_header.push_back<BinaryOpStmt>(BinaryOpType::cmp_lt,
-                                                       new_loop_vars[j], bound);
-        test = body_header.push_back<BinaryOpStmt>(BinaryOpType::bit_and, test,
-                                                   cmp);
+    std::copy(std::begin(start_bits_root), std::end(start_bits_root),
+              std::begin(start_bits));
+    for (int i = 0; i < (int)snodes.size(); i++) {
+      auto snode = snodes[i];
+      for (int j = 0; j < (int)physical_indices.size(); j++) {
+        auto p = physical_indices[j];
+        start_bits[p] -= snode->extractors[p].num_bits;
+        auto num_elements = snode->extractors[p].num_elements_from_root
+                            << start_bits[p];
+        if (!bit::is_power_of_two(num_elements)) {
+          has_test = true;
+          auto bound =
+              body_header.push_back<ConstStmt>(TypedConstant(num_elements));
+          auto cmp = body_header.push_back<BinaryOpStmt>(
+              BinaryOpType::cmp_lt, new_loop_vars[j], bound);
+          test = body_header.push_back<BinaryOpStmt>(BinaryOpType::bit_and,
+                                                     test, cmp);
+        }
       }
     }
   }
@@ -131,10 +166,10 @@ void convert_to_range_for(OffloadedStmt *offloaded) {
   offloaded->task_type = TaskType::range_for;
 }
 
-void maybe_convert(OffloadedStmt *stmt) {
+void maybe_convert(OffloadedStmt *stmt, bool packed) {
   if ((stmt->task_type == TaskType::struct_for) &&
       stmt->snode->is_path_all_dense) {
-    convert_to_range_for(stmt);
+    convert_to_range_for(stmt, packed);
   }
 }
 
@@ -142,15 +177,15 @@ void maybe_convert(OffloadedStmt *stmt) {
 
 namespace irpass {
 
-void demote_dense_struct_fors(IRNode *root) {
+void demote_dense_struct_fors(IRNode *root, bool packed) {
   if (auto *block = root->cast<Block>()) {
     for (auto &s_ : block->statements) {
       if (auto *s = s_->cast<OffloadedStmt>()) {
-        maybe_convert(s);
+        maybe_convert(s, packed);
       }
     }
   } else if (auto *s = root->cast<OffloadedStmt>()) {
-    maybe_convert(s);
+    maybe_convert(s, packed);
   }
   re_id(root);
 }
diff --git a/taichi/transforms/lower_access.cpp b/taichi/transforms/lower_access.cpp
index 299bfe53b..c2c44562a 100644
--- a/taichi/transforms/lower_access.cpp
+++ b/taichi/transforms/lower_access.cpp
@@ -46,11 +46,14 @@ class LowerAccess : public IRVisitor {
   StructForStmt *current_struct_for;
   const std::vector<SNode *> &kernel_forces_no_activate;
   bool lower_atomic_ptr;
+  bool packed;
 
   LowerAccess(const std::vector<SNode *> &kernel_forces_no_activate,
-              bool lower_atomic_ptr)
+              bool lower_atomic_ptr,
+              bool packed)
       : kernel_forces_no_activate(kernel_forces_no_activate),
-        lower_atomic_ptr(lower_atomic_ptr) {
+        lower_atomic_ptr(lower_atomic_ptr),
+        packed(packed) {
     // TODO: change this to false
     allow_undefined_visitor = true;
     current_struct_for = nullptr;
@@ -99,8 +102,8 @@ class LowerAccess : public IRVisitor {
       TI_ASSERT(!pointer_needs_activation);
     }
 
-    PtrLowererImpl lowerer{leaf_snode, indices, snode_op, is_bit_vectorized,
-                           lowered};
+    PtrLowererImpl lowerer{leaf_snode,        indices, snode_op,
+                           is_bit_vectorized, lowered, packed};
     lowerer.set_pointer_needs_activation(pointer_needs_activation);
     lowerer.set_lower_access(this);
     lowerer.run();
@@ -211,8 +214,9 @@ class LowerAccess : public IRVisitor {
 
   static bool run(IRNode *node,
                   const std::vector<SNode *> &kernel_forces_no_activate,
-                  bool lower_atomic) {
-    LowerAccess inst(kernel_forces_no_activate, lower_atomic);
+                  bool lower_atomic,
+                  bool packed) {
+    LowerAccess inst(kernel_forces_no_activate, lower_atomic, packed);
     bool modified = false;
     while (true) {
       node->accept(&inst);
@@ -303,8 +307,8 @@ namespace irpass {
 bool lower_access(IRNode *root,
                   const CompileConfig &config,
                   const LowerAccessPass::Args &args) {
-  bool modified =
-      LowerAccess::run(root, args.kernel_forces_no_activate, args.lower_atomic);
+  bool modified = LowerAccess::run(root, args.kernel_forces_no_activate,
+                                   args.lower_atomic, config.packed);
   type_check(root, config);
   return modified;
 }
diff --git a/taichi/transforms/scalar_pointer_lowerer.cpp b/taichi/transforms/scalar_pointer_lowerer.cpp
index 9a922452e..61cce2aa6 100644
--- a/taichi/transforms/scalar_pointer_lowerer.cpp
+++ b/taichi/transforms/scalar_pointer_lowerer.cpp
@@ -6,6 +6,7 @@
 #include "taichi/ir/snode.h"
 #include "taichi/ir/statements.h"
 #include "taichi/transforms/scalar_pointer_lowerer.h"
+#include "taichi/transforms/utils.h"
 
 namespace taichi {
 namespace lang {
@@ -14,11 +15,13 @@ ScalarPointerLowerer::ScalarPointerLowerer(SNode *leaf_snode,
                                            const std::vector<Stmt *> &indices,
                                            const SNodeOpType snode_op,
                                            const bool is_bit_vectorized,
-                                           VecStatement *lowered)
+                                           VecStatement *lowered,
+                                           const bool packed)
     : indices_(indices),
       snode_op_(snode_op),
       is_bit_vectorized_(is_bit_vectorized),
-      lowered_(lowered) {
+      lowered_(lowered),
+      packed_(packed) {
   for (auto *s = leaf_snode; s != nullptr; s = s->parent) {
     snodes_.push_back(s);
   }
@@ -45,6 +48,14 @@ void ScalarPointerLowerer::run() {
       start_bits[j] += s->extractors[j].num_bits;
     }
   }
+  // general shape calculation - no dependence on POT
+  std::array<int, taichi_max_num_indices> total_shape;
+  total_shape.fill(1);
+  for (const auto *s : snodes_) {
+    for (int j = 0; j < taichi_max_num_indices; j++) {
+      total_shape[j] *= s->extractors[j].shape;
+    }
+  }
 
   if (path_length_ == 0)
     return;
@@ -59,19 +70,26 @@ void ScalarPointerLowerer::run() {
     }
     std::vector<Stmt *> lowered_indices;
     std::vector<int> strides;
-    // extract bits
+    // extract lowered indices
     for (int k_ = 0; k_ < (int)indices_.size(); k_++) {
-      for (int k = 0; k < taichi_max_num_indices; k++) {
-        if (snode->physical_index_position[k_] == k) {
-          start_bits[k] -= snode->extractors[k].num_bits;
-          const int begin = start_bits[k];
-          const int end = begin + snode->extractors[k].num_bits;
-          auto extracted = Stmt::make<BitExtractStmt>(indices_[k_], begin, end);
-          lowered_indices.push_back(extracted.get());
-          lowered_->push_back(std::move(extracted));
-          strides.push_back(1 << snode->extractors[k].num_bits);
-        }
+      int k = snode->physical_index_position[k_];
+      if (k < 0)
+        continue;
+      Stmt *extracted;
+      if (packed_) {  // no dependence on POT
+        const int prev = total_shape[k];
+        total_shape[k] /= snode->extractors[k].shape;
+        const int next = total_shape[k];
+        extracted = generate_mod_x_div_y(lowered_, indices_[k_], prev, next);
+      } else {
+        const int end = start_bits[k];
+        start_bits[k] -= snode->extractors[k].num_bits;
+        const int begin = start_bits[k];
+        extracted =
+            lowered_->push_back<BitExtractStmt>(indices_[k_], begin, end);
       }
+      lowered_indices.push_back(extracted);
+      strides.push_back(snode->extractors[k].shape);
     }
     // linearize
     auto *linearized =
diff --git a/taichi/transforms/scalar_pointer_lowerer.h b/taichi/transforms/scalar_pointer_lowerer.h
index 574e9c9e6..a763ad3a6 100644
--- a/taichi/transforms/scalar_pointer_lowerer.h
+++ b/taichi/transforms/scalar_pointer_lowerer.h
@@ -31,7 +31,8 @@ class ScalarPointerLowerer {
                                 const std::vector<Stmt *> &indices,
                                 const SNodeOpType snode_op,
                                 const bool is_bit_vectorized,
-                                VecStatement *lowered);
+                                VecStatement *lowered,
+                                const bool packed);
 
   virtual ~ScalarPointerLowerer() = default;
   /**
@@ -67,6 +68,7 @@ class ScalarPointerLowerer {
   const SNodeOpType snode_op_;
   const bool is_bit_vectorized_;
   VecStatement *const lowered_;
+  const bool packed_;
 
  private:
   std::vector<SNode *> snodes_;
diff --git a/taichi/transforms/utils.cpp b/taichi/transforms/utils.cpp
new file mode 100644
index 000000000..d3dd28bcc
--- /dev/null
+++ b/taichi/transforms/utils.cpp
@@ -0,0 +1,18 @@
+#include "taichi/ir/statements.h"
+#include "taichi/transforms/utils.h"
+
+namespace taichi {
+namespace lang {
+
+// Appends to |stmts| the statements that compute `(num % x) / y`, with |x|
+// and |y| materialized as ConstStmts, and returns the final division
+// statement. Statements are pushed in the order: const x, mod, const y, div.
+Stmt *generate_mod_x_div_y(VecStatement *stmts, Stmt *num, int x, int y) {
+  auto const_x = stmts->push_back<ConstStmt>(TypedConstant(x));
+  auto mod_x = stmts->push_back<BinaryOpStmt>(BinaryOpType::mod, num, const_x);
+  auto const_y = stmts->push_back<ConstStmt>(TypedConstant(y));
+  return stmts->push_back<BinaryOpStmt>(BinaryOpType::div, mod_x, const_y);
+}
+
+}  // namespace lang
+}  // namespace taichi
diff --git a/taichi/transforms/utils.h b/taichi/transforms/utils.h
new file mode 100644
index 000000000..3440be031
--- /dev/null
+++ b/taichi/transforms/utils.h
@@ -0,0 +1,13 @@
+#pragma once
+
+#include "taichi/ir/ir.h"
+
+namespace taichi {
+namespace lang {
+
+// Emits statements computing `(num % x) / y` into |stmts| and returns the
+// resulting division statement.
+Stmt *generate_mod_x_div_y(VecStatement *stmts, Stmt *num, int x, int y);
+
+}  // namespace lang
+}  // namespace taichi
diff --git a/tests/cpp/codegen/refine_coordinates_test.cpp b/tests/cpp/codegen/refine_coordinates_test.cpp
index 5c04809bc..918ff832f 100644
--- a/tests/cpp/codegen/refine_coordinates_test.cpp
+++ b/tests/cpp/codegen/refine_coordinates_test.cpp
@@ -103,6 +103,7 @@ class RefineCoordinatesTest : public ::testing::Test {
  protected:
   void SetUp() override {
     arch_ = host_arch();
+    config_.packed = false;
     config_.print_kernel_llvm_ir = false;
     prog_ = std::make_unique<Program>(arch_);
     tlctx_ = prog_->llvm_context_host.get();
diff --git a/tests/cpp/struct/fake_struct_compiler.h b/tests/cpp/struct/fake_struct_compiler.h
index e409a01a1..05ab97f25 100644
--- a/tests/cpp/struct/fake_struct_compiler.h
+++ b/tests/cpp/struct/fake_struct_compiler.h
@@ -12,7 +12,7 @@ class FakeStructCompiler : public StructCompiler {
   }
 
   void run(SNode &root) override {
-    infer_snode_properties(root);
+    infer_snode_properties(root, false);
   }
 };
 
diff --git a/tests/cpp/transforms/scalar_pointer_lowerer_test.cpp b/tests/cpp/transforms/scalar_pointer_lowerer_test.cpp
index 7de569a3c..9a5d3a87e 100644
--- a/tests/cpp/transforms/scalar_pointer_lowerer_test.cpp
+++ b/tests/cpp/transforms/scalar_pointer_lowerer_test.cpp
@@ -62,7 +62,9 @@ TEST_F(ScalarPointerLowererTest, Basic) {
       LowererImpl lowerer{leaf_snode_,
                           std::vector<Stmt *>{builder.get_int32(loop_index)},
                           SNodeOpType::undefined,
-                          /*is_bit_vectorized=*/false, &lowered};
+                          /*is_bit_vectorized=*/false,
+                          &lowered,
+                          /*packed=*/false};
       lowerer.run();
       // There are three linearized stmts:
       // 0: for root
diff --git a/tests/python/test_indices_assert.py b/tests/python/test_indices_assert.py
index cfc2c1e46..ce6a0ce9a 100644
--- a/tests/python/test_indices_assert.py
+++ b/tests/python/test_indices_assert.py
@@ -7,7 +7,7 @@
 
 @pytest.mark.skipif(platform.system() == 'Windows',
                     reason="Too much virtual memory for github windows env.")
-@ti.test(debug=True, gdb_trigger=False, arch=[ti.cpu])
+@ti.test(debug=True, gdb_trigger=False, packed=False, arch=[ti.cpu])
 def test_indices_assert():
 
     overflow = ti.field(ti.i32, (334, 334, 334, 2 * 10))
diff --git a/tests/python/test_packed_size.py b/tests/python/test_packed_size.py
new file mode 100644
index 000000000..19837ec4f
--- /dev/null
+++ b/tests/python/test_packed_size.py
@@ -0,0 +1,15 @@
+import taichi as ti
+
+
+@ti.test(packed=True)
+def test_packed_size():
+    """Check the byte size reported for a non-power-of-two dense cell.
+
+    Runs with ``packed=True`` and asserts that the grandparent SNode of the
+    field reports ``cell_size_bytes == 4 * 334**3``.
+    """
+    val = ti.field(ti.i32)
+    inner = ti.root.dense(ti.i, 20).dense(ti.ijk, 334)
+    inner.place(val)
+    expected_bytes = 4 * 334**3
+    assert val.snode.parent().parent().cell_size_bytes == expected_bytes