diff --git a/.github/workflows/presubmit.yml b/.github/workflows/presubmit.yml index 265de5bc2..a451d6529 100644 --- a/.github/workflows/presubmit.yml +++ b/.github/workflows/presubmit.yml @@ -4,51 +4,9 @@ on: types: [opened, synchronize, reopened] jobs: - title_format: - name: Check PR Title - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - - name: Run PR Title Checker - run: | - pip install semver GitPython - python misc/ci_check_pr_title.py "$PR_TITLE" - env: - PR_TITLE: ${{ github.event.pull_request.title }} - - check_code_format: - name: Check Code Format - runs-on: ubuntu-latest - # This job will be required to pass before merging to master branch. - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: 3.8 - - name: Check code format - run: | - git config user.email "taichigardener@gmail.com" - git config user.name "Taichi Gardener" - git checkout -b _fake_squash - git remote add upstream https://github.com/taichi-dev/taichi.git - git fetch upstream master - sudo apt-get install clang-format - python3 -m pip install --user yapf gitpython colorama isort - python3 python/taichi/code_format.py - git checkout -b _enforced_format - git commit -am "enforce code format" || true - # exit with 1 if there were differences: - git diff _fake_squash _enforced_format --exit-code - build_and_test_cpu_required: # This job will be required to pass before merging to master branch. name: Required Build and Test (CPU) - needs: check_code_format strategy: matrix: include: @@ -95,7 +53,6 @@ jobs: build_and_test_cpu: name: Build and Test (CPU) - needs: build_and_test_cpu_required strategy: matrix: include: @@ -156,91 +113,3 @@ jobs: ti test -vr2 -t2 env: RUN_CPP_TESTS: ${{ matrix.with_cpp_tests }} - - build_and_test_gpu_linux: - name: Build and Test (GPU) - needs: check_code_format - runs-on: [self-hosted, cuda, cn] - steps: - - uses: actions/checkout@v2 - - - name: Build - run: | - git --version - export PATH=/home/github/taichi-llvm/bin/:$PATH - export CXX=clang++-8 - export PYTHON=/usr/bin/python3 - $PYTHON misc/ci_setup.py ci - env: - CI_SETUP_CMAKE_ARGS: -DTI_WITH_OPENGL:BOOL=ON -DTI_WITH_CC:BOOL=OFF - - - name: Test - run: | - export PYTHON=/usr/bin/python3 - export PATH=/home/github/taichi-llvm/bin/:$PATH - export PATH=$PATH:$HOME/.local/bin - export DISPLAY=:1 - hash -r - glewinfo - $PYTHON examples/algorithm/laplace.py - ti diagnose - ti test -vr2 -t2 - - build_and_test_windows: - name: Build and Test (Windows) - needs: check_code_format - runs-on: windows-latest - steps: - - name: Install 7Zip PowerShell - shell: powershell - run: Install-Module 7Zip4PowerShell -Force -Verbose - - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - with: - python-version: 3.7 - - - name: Add msbuild to PATH - uses: microsoft/setup-msbuild@v1.0.2 - - - name: Build - shell: powershell - run: | - $env:TAICHI_REPO_DIR = "D:\a\taichi\taichi" - $env:PYTHONPATH = "$env:TAICHI_REPO_DIR\python" - cd C:\ - Remove-item alias:curl - curl --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/taichi-llvm-10.0.0-msvc2019.zip -LO - 7z x taichi-llvm-10.0.0-msvc2019.zip -otaichi_llvm - curl --retry 10 --retry-delay 5 https://github.com/taichi-dev/taichi_assets/releases/download/llvm10/clang-10.0.0-win.zip -LO - 7z x clang-10.0.0-win.zip -otaichi_clang - $env:PATH += ";C:\taichi_llvm\bin" - $env:PATH += ";C:\taichi_clang\bin" - $env:PATH += ";$env:TAICHI_REPO_DIR\bin" - clang --version - cd D:\a\taichi\taichi - python -m pip install numpy - python -m pip install pybind11 - python misc/ci_setup.py ci - mkdir build - cd build - cmake .. -G"Visual Studio 16 2019" -A x64 -DPYTHON_EXECUTABLE="$env:PYTHON" -DLLVM_DIR="C:\taichi_llvm\lib\cmake\llvm" - msbuild /p:Configuration=RelWithDebInfo /p:Platform=x64 /m taichi.sln - cd .. - env: - PYTHON: C:\hostedtoolcache\windows\Python\3.7.9\x64\python.exe - - - name: Test - shell: powershell - run: | - $env:TAICHI_REPO_DIR = "D:\a\taichi\taichi" - $env:PYTHONPATH = "$env:TAICHI_REPO_DIR\python" - $env:PATH += ";C:\taichi_llvm\bin" - $env:PATH += ";C:\taichi_clang\bin" - $env:PATH += ";$env:TAICHI_REPO_DIR\bin" - python -c "import taichi" - python examples/algorithm/laplace.py - python bin/taichi diagnose - python bin/taichi test -Cvr2 -t2 - env: - PYTHON: C:\hostedtoolcache\windows\Python\3.7.9\x64\python.exe diff --git a/CMakeLists.txt b/CMakeLists.txt index 181ed4b34..990ff3489 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -94,8 +94,8 @@ endif() foreach(arch IN LISTS HOST_ARCH CUDA_ARCH) add_custom_target( "generate_llvm_runtime_${arch}" - COMMAND ${CLANG_EXECUTABLE} -S runtime.cpp -o runtime.ll -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR}; - COMMAND ${LLVM_AS_EXECUTABLE} runtime.ll -o "runtime_${arch}.bc" + COMMAND ${CLANG_EXECUTABLE} -S runtime.cpp -o runtime_${arch}.ll -fno-exceptions -emit-llvm -std=c++17 -D "ARCH_${arch}" -I ${PROJECT_SOURCE_DIR}; + COMMAND ${LLVM_AS_EXECUTABLE} runtime_${arch}.ll -o "runtime_${arch}.bc" WORKING_DIRECTORY "${PROJECT_SOURCE_DIR}/taichi/runtime/llvm" ) add_dependencies(${CORE_LIBRARY_NAME} "generate_llvm_runtime_${arch}") diff --git a/taichi/analysis/bls_analyzer.cpp b/taichi/analysis/bls_analyzer.cpp index d2105daca..868df66ab 100644 --- a/taichi/analysis/bls_analyzer.cpp +++ b/taichi/analysis/bls_analyzer.cpp @@ -25,8 +25,7 @@ void BLSAnalyzer::generate_block_indices(SNode *snode, BlockIndices *indices) { // NOTE: Assuming not vectorized for (int i = 0; i < snode->num_active_indices; i++) { auto j = snode->physical_index_position[i]; - indices->push_back( - {/*low=*/0, /*high=*/(1 << snode->extractors[j].num_bits) - 1}); + indices->push_back({/*low=*/0, /*high=*/snode->extractors[j].shape - 1}); } } diff --git a/taichi/backends/metal/kernel_manager.cpp b/taichi/backends/metal/kernel_manager.cpp index fc48b7f01..6b311564d 100644 --- a/taichi/backends/metal/kernel_manager.cpp +++ b/taichi/backends/metal/kernel_manager.cpp @@ -816,9 +816,10 @@ class KernelManager::Impl { const auto &ext = sn->extractors[j]; rtm_ext->extractors[j].num_bits = ext.num_bits; rtm_ext->extractors[j].acc_offset = ext.acc_offset; - rtm_ext->extractors[j].num_elements = ext.num_elements; - TI_DEBUG(" [{}] num_bits={} acc_offset={} num_elements={}", j, - ext.num_bits, ext.acc_offset, ext.num_elements); + rtm_ext->extractors[j].num_elements_from_root = + ext.num_elements_from_root; + TI_DEBUG(" [{}] num_bits={} acc_offset={} num_elements_from_root={}", + j, ext.num_bits, ext.acc_offset, ext.num_elements_from_root); } TI_DEBUG(""); } diff --git a/taichi/backends/metal/shaders/runtime_structs.metal.h b/taichi/backends/metal/shaders/runtime_structs.metal.h index 171e8463f..9b3347809 100644 --- a/taichi/backends/metal/shaders/runtime_structs.metal.h +++ b/taichi/backends/metal/shaders/runtime_structs.metal.h @@ -117,7 +117,7 @@ STR( int32_t start = 0; int32_t num_bits = 0; int32_t acc_offset = 0; - int32_t num_elements = 0; + int32_t num_elements_from_root = 0; }; Extractor extractors[kTaichiMaxNumIndices]; diff --git a/taichi/backends/metal/struct_metal.cpp b/taichi/backends/metal/struct_metal.cpp index 8504e046b..f0c654693 100644 --- a/taichi/backends/metal/struct_metal.cpp +++ b/taichi/backends/metal/struct_metal.cpp @@ -337,7 +337,7 @@ class StructCompiler { } sn_desc.total_num_elems_from_root = 1; for (const auto &e : sn->extractors) { - sn_desc.total_num_elems_from_root *= e.num_elements; + sn_desc.total_num_elems_from_root *= e.num_elements_from_root; } TI_ASSERT(snode_descriptors_.find(sn->id) == snode_descriptors_.end()); diff --git a/taichi/codegen/codegen_llvm.cpp b/taichi/codegen/codegen_llvm.cpp index 59d4b9335..3e1bd8c05 100644 --- a/taichi/codegen/codegen_llvm.cpp +++ b/taichi/codegen/codegen_llvm.cpp @@ -1673,15 +1673,18 @@ void CodeGenLLVM::create_offload_struct_for(OffloadedStmt *stmt, bool spmd) { auto coord_object = RuntimeObject(kLLVMPhysicalCoordinatesName, this, builder.get(), new_coordinates); - for (int i = 0; i < snode->num_active_indices; i++) { - auto j = snode->physical_index_position[i]; - if (!bit::is_power_of_two(snode->extractors[j].num_elements)) { - auto coord = coord_object.get("val", tlctx->get_constant(j)); - exec_cond = builder->CreateAnd( - exec_cond, - builder->CreateICmp( - llvm::CmpInst::ICMP_SLT, coord, - tlctx->get_constant(snode->extractors[j].num_elements))); + if (!prog->config.packed) { + for (int i = 0; i < snode->num_active_indices; i++) { + auto j = snode->physical_index_position[i]; + if (!bit::is_power_of_two( + snode->extractors[j].num_elements_from_root)) { + auto coord = coord_object.get("val", tlctx->get_constant(j)); + exec_cond = builder->CreateAnd( + exec_cond, builder->CreateICmp( + llvm::CmpInst::ICMP_SLT, coord, + tlctx->get_constant( + snode->extractors[j].num_elements_from_root))); + } } } diff --git a/taichi/ir/scratch_pad.h b/taichi/ir/scratch_pad.h index 7081be117..7df97db99 100644 --- a/taichi/ir/scratch_pad.h +++ b/taichi/ir/scratch_pad.h @@ -103,8 +103,7 @@ class ScratchPad { block_size.resize(dim); for (int i = 0; i < dim; i++) { block_size[i] = - 1 << snode->parent->extractors[snode->physical_index_position[i]] - .num_bits; + snode->parent->extractors[snode->physical_index_position[i]].shape; TI_ASSERT(bounds[i].low != std::numeric_limits::max()); TI_ASSERT(bounds[i].high != std::numeric_limits::min()); } diff --git a/taichi/ir/snode.cpp b/taichi/ir/snode.cpp index 2bdf25156..60d6da07a 100644 --- a/taichi/ir/snode.cpp +++ b/taichi/ir/snode.cpp @@ -2,6 +2,7 @@ #include "taichi/ir/ir.h" #include "taichi/ir/statements.h" +#include "taichi/program/program.h" TLANG_NAMESPACE_BEGIN @@ -34,23 +35,13 @@ SNode &SNode::create_node(std::vector indices, "hashed node must be child of root due to initialization " "memset limitation."); auto &new_node = insert_children(type); - new_node.n = 1; - for (int i = 0; i < sizes.size(); i++) { - auto s = sizes[i]; - TI_ASSERT(sizes[i] > 0); - if (!bit::is_power_of_two(s)) { - auto promoted_s = bit::least_pot_bound(s); - TI_DEBUG("Non-power-of-two node size {} promoted to {}.", s, promoted_s); - s = promoted_s; - } - TI_ASSERT(bit::is_power_of_two(s)); - new_node.n *= s; - } for (int i = 0; i < (int)indices.size(); i++) { + TI_ASSERT(sizes[i] > 0); auto &ind = indices[i]; new_node.extractors[ind.value].activate( bit::log2int(bit::least_pot_bound(sizes[i]))); - new_node.extractors[ind.value].num_elements = sizes[i]; + new_node.extractors[ind.value].shape = sizes[i]; + new_node.extractors[ind.value].num_elements_from_root = sizes[i]; } return new_node; } @@ -99,7 +90,7 @@ SNode *SNode::get_least_sparse_ancestor() const { int SNode::shape_along_axis(int i) const { const auto &extractor = extractors[physical_index_position[i]]; - return extractor.num_elements; + return extractor.num_elements_from_root; } SNode::SNode() : SNode(0, SNodeType::undefined) { diff --git a/taichi/ir/snode.h b/taichi/ir/snode.h index 5740e0d27..060acc332 100644 --- a/taichi/ir/snode.h +++ b/taichi/ir/snode.h @@ -34,26 +34,34 @@ class Index { */ struct IndexExtractor { /** - * Shape at the given index. + * Number of elements from root at this index. * - * This is the raw shape, *not* padded to power-of-two (POT). + * This is the raw number, *not* padded to power-of-two (POT). */ - int num_elements{1}; + int num_elements_from_root{1}; + /** + * Shape at this index (POT or packed) according to the config. + */ + int shape{1}; + /** + * Accumulated shape from the last activated index to the first one. + */ + int acc_shape{1}; /** * Number of bits needed to store the coordinate at this index. * - * ceil(log2(num_elements)) + * ceil(log2(shape)) */ int num_bits{0}; /** * Accumulated offset from the last activated index to the first one. * - * This is the starting bit of this index in a linearized 1D coordiate. For + * This is the starting bit of this index in a linearized 1D coordinate. For * example, assuming an SNode of (ti.ijk, shape=(4, 8, 16)). ti.i takes 2 * bits, ti.j 3 bits and ti.k 4 bits. Then for a linearized coordinate: - * ti.k uses bits [0, 3), acc_offset=0 - * tk.j uses btis [3, 6), acc_offset=3 - * ti.i uses bits [6, 8), acc_offset=6 + * ti.k uses bits [0, 4), acc_offset=0 + * ti.j uses bits [4, 7), acc_offset=4 + * ti.i uses bits [7, 9), acc_offset=7 */ int acc_offset{0}; /** @@ -106,7 +114,7 @@ class SNode { int depth{0}; std::string name; - int64 n{0}; + int64 n{1}; int total_num_bits{0}; int total_bit_start{0}; int chunk_size{0}; @@ -283,7 +291,7 @@ class SNode { } int64 max_num_elements() const { - return int64(1) << total_num_bits; + return n; } int shape_along_axis(int i) const; diff --git a/taichi/ir/transforms.h b/taichi/ir/transforms.h index 5b6059916..6b22cc475 100644 --- a/taichi/ir/transforms.h +++ b/taichi/ir/transforms.h @@ -106,7 +106,7 @@ bool replace_and_insert_statements( bool replace_statements(IRNode *root, std::function filter, std::function finder); -void demote_dense_struct_fors(IRNode *root); +void demote_dense_struct_fors(IRNode *root, bool packed); bool demote_atomics(IRNode *root, const CompileConfig &config); void reverse_segments(IRNode *root); // for autograd void detect_read_only(IRNode *root); diff --git a/taichi/program/compile_config.cpp b/taichi/program/compile_config.cpp index a06805f48..9e126b589 100644 --- a/taichi/program/compile_config.cpp +++ b/taichi/program/compile_config.cpp @@ -8,6 +8,7 @@ CompileConfig::CompileConfig() { arch = host_arch(); simd_width = default_simd_width(arch); external_optimization_level = 3; + packed = false; print_ir = false; print_accessor_ir = false; print_evaluator_ir = false; diff --git a/taichi/program/compile_config.h b/taichi/program/compile_config.h index e42d4e265..3c2d9b465 100644 --- a/taichi/program/compile_config.h +++ b/taichi/program/compile_config.h @@ -14,6 +14,7 @@ struct CompileConfig { bool lazy_compilation; int external_optimization_level; int max_vector_width; + bool packed; bool print_ir; bool print_accessor_ir; bool print_evaluator_ir; diff --git a/taichi/program/program.cpp b/taichi/program/program.cpp index 0b11a2061..b8c88894e 100644 --- a/taichi/program/program.cpp +++ b/taichi/program/program.cpp @@ -429,7 +429,7 @@ void Program::initialize_llvm_runtime_snodes(const SNodeTree *tree, int Program::add_snode_tree(std::unique_ptr root) { const int id = snode_trees_.size(); - auto tree = std::make_unique(id, std::move(root)); + auto tree = std::make_unique(id, std::move(root), config.packed); tree->root()->set_snode_tree_id(id); materialize_snode_tree(tree.get()); snode_trees_.push_back(std::move(tree)); diff --git a/taichi/python/export_lang.cpp b/taichi/python/export_lang.cpp index ff94466ec..ed469c619 100644 --- a/taichi/python/export_lang.cpp +++ b/taichi/python/export_lang.cpp @@ -119,6 +119,7 @@ void export_lang(py::module &m) { py::class_(m, "CompileConfig") .def(py::init<>()) .def_readwrite("arch", &CompileConfig::arch) + .def_readwrite("packed", &CompileConfig::packed) .def_readwrite("print_ir", &CompileConfig::print_ir) .def_readwrite("debug", &CompileConfig::debug) .def_readwrite("cfg_optimization", &CompileConfig::cfg_optimization) diff --git a/taichi/struct/snode_tree.cpp b/taichi/struct/snode_tree.cpp index f26a48543..4893b66ab 100644 --- a/taichi/struct/snode_tree.cpp +++ b/taichi/struct/snode_tree.cpp @@ -5,9 +5,9 @@ namespace taichi { namespace lang { -SNodeTree::SNodeTree(int id, std::unique_ptr root) +SNodeTree::SNodeTree(int id, std::unique_ptr root, bool packed) : id_(id), root_(std::move(root)) { - infer_snode_properties(*root_); + infer_snode_properties(*root_, packed); } } // namespace lang diff --git a/taichi/struct/snode_tree.h b/taichi/struct/snode_tree.h index 58f58f2d3..30ef34dc4 100644 --- a/taichi/struct/snode_tree.h +++ b/taichi/struct/snode_tree.h @@ -22,7 +22,7 @@ class SNodeTree { * @param id Id of the tree * @param root Root of the tree */ - explicit SNodeTree(int id, std::unique_ptr root); + explicit SNodeTree(int id, std::unique_ptr root, bool packed); int id() const { return id_; diff --git a/taichi/struct/struct.cpp b/taichi/struct/struct.cpp index 01115c79b..d4b8355e7 100644 --- a/taichi/struct/struct.cpp +++ b/taichi/struct/struct.cpp @@ -9,12 +9,13 @@ namespace taichi { namespace lang { -void infer_snode_properties(SNode &snode) { +void infer_snode_properties(SNode &snode, bool packed) { for (int ch_id = 0; ch_id < (int)snode.ch.size(); ch_id++) { auto &ch = snode.ch[ch_id]; ch->parent = &snode; for (int i = 0; i < taichi_max_num_indices; i++) { - ch->extractors[i].num_elements *= snode.extractors[i].num_elements; + ch->extractors[i].num_elements_from_root *= + snode.extractors[i].num_elements_from_root; bool found = false; for (int k = 0; k < taichi_max_num_indices; k++) { if (snode.physical_index_position[k] == i) { @@ -40,15 +41,27 @@ void infer_snode_properties(SNode &snode) { ch->is_bit_level = snode.is_bit_level; } - infer_snode_properties(*ch); + infer_snode_properties(*ch, packed); } // infer extractors + int acc_shape = 1; + for (int i = taichi_max_num_indices - 1; i >= 0; i--) { + // if not in packed mode, pad shape to POT + if (!packed) { + snode.extractors[i].shape = 1 << snode.extractors[i].num_bits; + } + snode.extractors[i].acc_shape = acc_shape; + acc_shape *= snode.extractors[i].shape; + } + snode.n = acc_shape; + // infer extractors (only for POT) int acc_offsets = 0; for (int i = taichi_max_num_indices - 1; i >= 0; i--) { snode.extractors[i].acc_offset = acc_offsets; acc_offsets += snode.extractors[i].num_bits; } + snode.total_num_bits = acc_offsets; if (snode.type == SNodeType::dynamic) { int active_extractor_counder = 0; for (int i = 0; i < taichi_max_num_indices; i++) { @@ -67,10 +80,6 @@ void infer_snode_properties(SNode &snode) { "Dynamic SNode can have only one index extractor."); } - snode.total_num_bits = 0; - for (int i = 0; i < taichi_max_num_indices; i++) { - snode.total_num_bits += snode.extractors[i].num_bits; - } // The highest bit is for the sign. constexpr int kMaxTotalNumBits = 64; TI_ERROR_IF( diff --git a/taichi/struct/struct.h b/taichi/struct/struct.h index 7883fd12b..dd7be4af8 100644 --- a/taichi/struct/struct.h +++ b/taichi/struct/struct.h @@ -12,7 +12,7 @@ namespace lang { * * @param snode The root SNode to compute. */ -void infer_snode_properties(SNode &snode); +void infer_snode_properties(SNode &snode, bool packed); class StructCompiler { public: diff --git a/taichi/struct/struct_llvm.cpp b/taichi/struct/struct_llvm.cpp index c8246271d..5c32b8681 100644 --- a/taichi/struct/struct_llvm.cpp +++ b/taichi/struct/struct_llvm.cpp @@ -192,20 +192,39 @@ void StructCompilerLLVM::generate_refine_coordinates(SNode *snode) { auto outp_coords = args[1]; auto l = args[2]; - for (int i = 0; i < taichi_max_num_indices; i++) { - auto addition = tlctx_->get_constant(0); - if (snode->extractors[i].num_bits) { - auto mask = ((1 << snode->extractors[i].num_bits) - 1); - addition = builder.CreateAnd( - builder.CreateAShr(l, snode->extractors[i].acc_offset), mask); + if (config_->packed) { // no dependence on POT + for (int i = 0; i < taichi_max_num_indices; i++) { + auto addition = tlctx_->get_constant(0); + if (snode->extractors[i].shape > 1) { + auto prev = tlctx_->get_constant(snode->extractors[i].acc_shape * + snode->extractors[i].shape); + auto next = tlctx_->get_constant(snode->extractors[i].acc_shape); + addition = builder.CreateSDiv(builder.CreateSRem(l, prev), next); + } + auto in = call(&builder, "PhysicalCoordinates_get_val", inp_coords, + tlctx_->get_constant(i)); + in = builder.CreateMul(in, + tlctx_->get_constant(snode->extractors[i].shape)); + auto added = builder.CreateAdd(in, addition); + call(&builder, "PhysicalCoordinates_set_val", outp_coords, + tlctx_->get_constant(i), added); + } + } else { + for (int i = 0; i < taichi_max_num_indices; i++) { + auto addition = tlctx_->get_constant(0); + if (snode->extractors[i].num_bits) { + auto mask = ((1 << snode->extractors[i].num_bits) - 1); + addition = builder.CreateAnd( + builder.CreateAShr(l, snode->extractors[i].acc_offset), mask); + } + auto in = call(&builder, "PhysicalCoordinates_get_val", inp_coords, + tlctx_->get_constant(i)); + in = builder.CreateShl( + in, tlctx_->get_constant(snode->extractors[i].num_bits)); + auto added = builder.CreateOr(in, addition); + call(&builder, "PhysicalCoordinates_set_val", outp_coords, + tlctx_->get_constant(i), added); } - auto in = call(&builder, "PhysicalCoordinates_get_val", inp_coords, - tlctx_->get_constant(i)); - in = builder.CreateShl(in, - tlctx_->get_constant(snode->extractors[i].num_bits)); - auto added = builder.CreateOr(in, addition); - call(&builder, "PhysicalCoordinates_set_val", outp_coords, - tlctx_->get_constant(i), added); } builder.CreateRetVoid(); } diff --git a/taichi/transforms/compile_to_offloads.cpp b/taichi/transforms/compile_to_offloads.cpp index 367b7f2cb..24778f996 100644 --- a/taichi/transforms/compile_to_offloads.cpp +++ b/taichi/transforms/compile_to_offloads.cpp @@ -173,7 +173,7 @@ void offload_to_executable(IRNode *ir, irpass::analysis::verify(ir); if (config.demote_dense_struct_fors) { - irpass::demote_dense_struct_fors(ir); + irpass::demote_dense_struct_fors(ir, config.packed); irpass::type_check(ir, config); print("Dense struct-for demoted"); irpass::analysis::verify(ir); diff --git a/taichi/transforms/demote_dense_struct_fors.cpp b/taichi/transforms/demote_dense_struct_fors.cpp index 23656aa74..2bc995547 100644 --- a/taichi/transforms/demote_dense_struct_fors.cpp +++ b/taichi/transforms/demote_dense_struct_fors.cpp @@ -2,6 +2,7 @@ #include "taichi/ir/statements.h" #include "taichi/ir/transforms.h" #include "taichi/ir/visitors.h" +#include "taichi/transforms/utils.h" TLANG_NAMESPACE_BEGIN @@ -9,7 +10,7 @@ namespace { using TaskType = OffloadedStmt::TaskType; -void convert_to_range_for(OffloadedStmt *offloaded) { +void convert_to_range_for(OffloadedStmt *offloaded, bool packed) { TI_ASSERT(offloaded->task_type == TaskType::struct_for); std::vector snodes; @@ -27,10 +28,21 @@ void convert_to_range_for(OffloadedStmt *offloaded) { std::reverse(snodes.begin(), snodes.end()); TI_ASSERT(total_bits <= 30); + // general shape calculation - no dependence on POT + int total_n = 1; + std::array total_shape; + total_shape.fill(1); + for (const auto *s : snodes) { + for (int j = 0; j < taichi_max_num_indices; j++) { + total_shape[j] *= s->extractors[j].shape; + } + total_n *= s->n; + } + offloaded->const_begin = true; offloaded->const_end = true; offloaded->begin_value = 0; - offloaded->end_value = 1 << total_bits; + offloaded->end_value = total_n; ////// Begin core transformation auto body = std::move(offloaded->body); @@ -51,47 +63,70 @@ void convert_to_range_for(OffloadedStmt *offloaded) { auto main_loop_var = body_header.push_back(nullptr, 0); // We will set main_loop_var->loop later. - int offset = total_bits; - int start_bits[taichi_max_num_indices] = {0}; - std::copy(std::begin(start_bits_root), std::end(start_bits_root), - std::begin(start_bits)); Stmt *test = body_header.push_back(TypedConstant(-1)); bool has_test = false; - for (int i = 0; i < (int)snodes.size(); i++) { - auto snode = snodes[i]; - offset -= snode->total_num_bits; - for (int j = 0; j < (int)physical_indices.size(); j++) { - auto p = physical_indices[j]; - auto ext = snode->extractors[p]; - Stmt *delta = body_header.push_back( - main_loop_var, ext.acc_offset + offset, - ext.acc_offset + offset + ext.num_bits); - start_bits[p] -= ext.num_bits; - auto multiplier = - body_header.push_back(TypedConstant(1 << start_bits[p])); - delta = body_header.push_back(BinaryOpType::mul, delta, - multiplier); - new_loop_vars[j] = body_header.push_back( - BinaryOpType::add, new_loop_vars[j], delta); + if (packed) { // no dependence on POT + for (int i = 0; i < (int)snodes.size(); i++) { + auto snode = snodes[i]; + auto extracted = generate_mod_x_div_y(&body_header, main_loop_var, + total_n, total_n / snode->n); + total_n /= snode->n; + for (int j = 0; j < (int)physical_indices.size(); j++) { + auto p = physical_indices[j]; + auto ext = snode->extractors[p]; + auto index = generate_mod_x_div_y( + &body_header, extracted, ext.acc_shape * ext.shape, ext.acc_shape); + total_shape[p] /= ext.shape; + auto multiplier = + body_header.push_back(TypedConstant(total_shape[p])); + auto delta = body_header.push_back(BinaryOpType::mul, + index, multiplier); + new_loop_vars[j] = body_header.push_back( + BinaryOpType::add, new_loop_vars[j], delta); + } + } + } else { + int offset = total_bits; + int start_bits[taichi_max_num_indices] = {0}; + std::copy(std::begin(start_bits_root), std::end(start_bits_root), + std::begin(start_bits)); + for (int i = 0; i < (int)snodes.size(); i++) { + auto snode = snodes[i]; + offset -= snode->total_num_bits; + for (int j = 0; j < (int)physical_indices.size(); j++) { + auto p = physical_indices[j]; + auto ext = snode->extractors[p]; + Stmt *delta = body_header.push_back( + main_loop_var, ext.acc_offset + offset, + ext.acc_offset + offset + ext.num_bits); + start_bits[p] -= ext.num_bits; + auto multiplier = + body_header.push_back(TypedConstant(1 << start_bits[p])); + delta = body_header.push_back(BinaryOpType::mul, delta, + multiplier); + new_loop_vars[j] = body_header.push_back( + BinaryOpType::add, new_loop_vars[j], delta); + } } - } - std::copy(std::begin(start_bits_root), std::end(start_bits_root), - std::begin(start_bits)); - for (int i = 0; i < (int)snodes.size(); i++) { - auto snode = snodes[i]; - for (int j = 0; j < (int)physical_indices.size(); j++) { - auto p = physical_indices[j]; - start_bits[p] -= snode->extractors[p].num_bits; - auto num_elements = snode->extractors[p].num_elements << start_bits[p]; - if (!bit::is_power_of_two(num_elements)) { - has_test = true; - auto bound = - body_header.push_back(TypedConstant(num_elements)); - auto cmp = body_header.push_back(BinaryOpType::cmp_lt, - new_loop_vars[j], bound); - test = body_header.push_back(BinaryOpType::bit_and, test, - cmp); + std::copy(std::begin(start_bits_root), std::end(start_bits_root), + std::begin(start_bits)); + for (int i = 0; i < (int)snodes.size(); i++) { + auto snode = snodes[i]; + for (int j = 0; j < (int)physical_indices.size(); j++) { + auto p = physical_indices[j]; + start_bits[p] -= snode->extractors[p].num_bits; + auto num_elements = snode->extractors[p].num_elements_from_root + << start_bits[p]; + if (!bit::is_power_of_two(num_elements)) { + has_test = true; + auto bound = + body_header.push_back(TypedConstant(num_elements)); + auto cmp = body_header.push_back( + BinaryOpType::cmp_lt, new_loop_vars[j], bound); + test = body_header.push_back(BinaryOpType::bit_and, + test, cmp); + } } } } @@ -131,10 +166,10 @@ void convert_to_range_for(OffloadedStmt *offloaded) { offloaded->task_type = TaskType::range_for; } -void maybe_convert(OffloadedStmt *stmt) { +void maybe_convert(OffloadedStmt *stmt, bool packed) { if ((stmt->task_type == TaskType::struct_for) && stmt->snode->is_path_all_dense) { - convert_to_range_for(stmt); + convert_to_range_for(stmt, packed); } } @@ -142,15 +177,15 @@ void maybe_convert(OffloadedStmt *stmt) { namespace irpass { -void demote_dense_struct_fors(IRNode *root) { +void demote_dense_struct_fors(IRNode *root, bool packed) { if (auto *block = root->cast()) { for (auto &s_ : block->statements) { if (auto *s = s_->cast()) { - maybe_convert(s); + maybe_convert(s, packed); } } } else if (auto *s = root->cast()) { - maybe_convert(s); + maybe_convert(s, packed); } re_id(root); } diff --git a/taichi/transforms/lower_access.cpp b/taichi/transforms/lower_access.cpp index 299bfe53b..c2c44562a 100644 --- a/taichi/transforms/lower_access.cpp +++ b/taichi/transforms/lower_access.cpp @@ -46,11 +46,14 @@ class LowerAccess : public IRVisitor { StructForStmt *current_struct_for; const std::vector &kernel_forces_no_activate; bool lower_atomic_ptr; + bool packed; LowerAccess(const std::vector &kernel_forces_no_activate, - bool lower_atomic_ptr) + bool lower_atomic_ptr, + bool packed) : kernel_forces_no_activate(kernel_forces_no_activate), - lower_atomic_ptr(lower_atomic_ptr) { + lower_atomic_ptr(lower_atomic_ptr), + packed(packed) { // TODO: change this to false allow_undefined_visitor = true; current_struct_for = nullptr; @@ -99,8 +102,8 @@ class LowerAccess : public IRVisitor { TI_ASSERT(!pointer_needs_activation); } - PtrLowererImpl lowerer{leaf_snode, indices, snode_op, is_bit_vectorized, - lowered}; + PtrLowererImpl lowerer{leaf_snode, indices, snode_op, + is_bit_vectorized, lowered, packed}; lowerer.set_pointer_needs_activation(pointer_needs_activation); lowerer.set_lower_access(this); lowerer.run(); @@ -211,8 +214,9 @@ class LowerAccess : public IRVisitor { static bool run(IRNode *node, const std::vector &kernel_forces_no_activate, - bool lower_atomic) { - LowerAccess inst(kernel_forces_no_activate, lower_atomic); + bool lower_atomic, + bool packed) { + LowerAccess inst(kernel_forces_no_activate, lower_atomic, packed); bool modified = false; while (true) { node->accept(&inst); @@ -303,8 +307,8 @@ namespace irpass { bool lower_access(IRNode *root, const CompileConfig &config, const LowerAccessPass::Args &args) { - bool modified = - LowerAccess::run(root, args.kernel_forces_no_activate, args.lower_atomic); + bool modified = LowerAccess::run(root, args.kernel_forces_no_activate, + args.lower_atomic, config.packed); type_check(root, config); return modified; } diff --git a/taichi/transforms/scalar_pointer_lowerer.cpp b/taichi/transforms/scalar_pointer_lowerer.cpp index 9a922452e..61cce2aa6 100644 --- a/taichi/transforms/scalar_pointer_lowerer.cpp +++ b/taichi/transforms/scalar_pointer_lowerer.cpp @@ -6,6 +6,7 @@ #include "taichi/ir/snode.h" #include "taichi/ir/statements.h" #include "taichi/transforms/scalar_pointer_lowerer.h" +#include "taichi/transforms/utils.h" namespace taichi { namespace lang { @@ -14,11 +15,13 @@ ScalarPointerLowerer::ScalarPointerLowerer(SNode *leaf_snode, const std::vector &indices, const SNodeOpType snode_op, const bool is_bit_vectorized, - VecStatement *lowered) + VecStatement *lowered, + const bool packed) : indices_(indices), snode_op_(snode_op), is_bit_vectorized_(is_bit_vectorized), - lowered_(lowered) { + lowered_(lowered), + packed_(packed) { for (auto *s = leaf_snode; s != nullptr; s = s->parent) { snodes_.push_back(s); } @@ -45,6 +48,14 @@ void ScalarPointerLowerer::run() { start_bits[j] += s->extractors[j].num_bits; } } + // general shape calculation - no dependence on POT + std::array total_shape; + total_shape.fill(1); + for (const auto *s : snodes_) { + for (int j = 0; j < taichi_max_num_indices; j++) { + total_shape[j] *= s->extractors[j].shape; + } + } if (path_length_ == 0) return; @@ -59,19 +70,26 @@ void ScalarPointerLowerer::run() { } std::vector lowered_indices; std::vector strides; - // extract bits + // extract lowered indices for (int k_ = 0; k_ < (int)indices_.size(); k_++) { - for (int k = 0; k < taichi_max_num_indices; k++) { - if (snode->physical_index_position[k_] == k) { - start_bits[k] -= snode->extractors[k].num_bits; - const int begin = start_bits[k]; - const int end = begin + snode->extractors[k].num_bits; - auto extracted = Stmt::make(indices_[k_], begin, end); - lowered_indices.push_back(extracted.get()); - lowered_->push_back(std::move(extracted)); - strides.push_back(1 << snode->extractors[k].num_bits); - } + int k = snode->physical_index_position[k_]; + if (k < 0) + continue; + Stmt *extracted; + if (packed_) { // no dependence on POT + const int prev = total_shape[k]; + total_shape[k] /= snode->extractors[k].shape; + const int next = total_shape[k]; + extracted = generate_mod_x_div_y(lowered_, indices_[k_], prev, next); + } else { + const int end = start_bits[k]; + start_bits[k] -= snode->extractors[k].num_bits; + const int begin = start_bits[k]; + extracted = + lowered_->push_back(indices_[k_], begin, end); } + lowered_indices.push_back(extracted); + strides.push_back(snode->extractors[k].shape); } // linearize auto *linearized = diff --git a/taichi/transforms/scalar_pointer_lowerer.h b/taichi/transforms/scalar_pointer_lowerer.h index 574e9c9e6..a763ad3a6 100644 --- a/taichi/transforms/scalar_pointer_lowerer.h +++ b/taichi/transforms/scalar_pointer_lowerer.h @@ -31,7 +31,8 @@ class ScalarPointerLowerer { const std::vector &indices, const SNodeOpType snode_op, const bool is_bit_vectorized, - VecStatement *lowered); + VecStatement *lowered, + const bool packed); virtual ~ScalarPointerLowerer() = default; /** @@ -67,6 +68,7 @@ class ScalarPointerLowerer { const SNodeOpType snode_op_; const bool is_bit_vectorized_; VecStatement *const lowered_; + const bool packed_; private: std::vector snodes_; diff --git a/taichi/transforms/utils.cpp b/taichi/transforms/utils.cpp new file mode 100644 index 000000000..d3dd28bcc --- /dev/null +++ b/taichi/transforms/utils.cpp @@ -0,0 +1,14 @@ +#include "taichi/ir/statements.h" + +namespace taichi { +namespace lang { + +Stmt *generate_mod_x_div_y(VecStatement *stmts, Stmt *num, int x, int y) { + auto const_x = stmts->push_back(TypedConstant(x)); + auto mod_x = stmts->push_back(BinaryOpType::mod, num, const_x); + auto const_y = stmts->push_back(TypedConstant(y)); + return stmts->push_back(BinaryOpType::div, mod_x, const_y); +} + +} // namespace lang +} // namespace taichi diff --git a/taichi/transforms/utils.h b/taichi/transforms/utils.h new file mode 100644 index 000000000..3440be031 --- /dev/null +++ b/taichi/transforms/utils.h @@ -0,0 +1,9 @@ +#pragma once + +namespace taichi { +namespace lang { + +Stmt *generate_mod_x_div_y(VecStatement *stmts, Stmt *num, int x, int y); + +} // namespace lang +} // namespace taichi diff --git a/tests/cpp/codegen/refine_coordinates_test.cpp b/tests/cpp/codegen/refine_coordinates_test.cpp index 5c04809bc..918ff832f 100644 --- a/tests/cpp/codegen/refine_coordinates_test.cpp +++ b/tests/cpp/codegen/refine_coordinates_test.cpp @@ -103,6 +103,7 @@ class RefineCoordinatesTest : public ::testing::Test { protected: void SetUp() override { arch_ = host_arch(); + config_.packed = false; config_.print_kernel_llvm_ir = false; prog_ = std::make_unique(arch_); tlctx_ = prog_->llvm_context_host.get(); diff --git a/tests/cpp/struct/fake_struct_compiler.h b/tests/cpp/struct/fake_struct_compiler.h index e409a01a1..05ab97f25 100644 --- a/tests/cpp/struct/fake_struct_compiler.h +++ b/tests/cpp/struct/fake_struct_compiler.h @@ -12,7 +12,7 @@ class FakeStructCompiler : public StructCompiler { } void run(SNode &root) override { - infer_snode_properties(root); + infer_snode_properties(root, false); } }; diff --git a/tests/cpp/transforms/scalar_pointer_lowerer_test.cpp b/tests/cpp/transforms/scalar_pointer_lowerer_test.cpp index 7de569a3c..9a5d3a87e 100644 --- a/tests/cpp/transforms/scalar_pointer_lowerer_test.cpp +++ b/tests/cpp/transforms/scalar_pointer_lowerer_test.cpp @@ -62,7 +62,9 @@ TEST_F(ScalarPointerLowererTest, Basic) { LowererImpl lowerer{leaf_snode_, std::vector{builder.get_int32(loop_index)}, SNodeOpType::undefined, - /*is_bit_vectorized=*/false, &lowered}; + /*is_bit_vectorized=*/false, + &lowered, + /*packed=*/false}; lowerer.run(); // There are three linearized stmts: // 0: for root diff --git a/tests/python/test_indices_assert.py b/tests/python/test_indices_assert.py index cfc2c1e46..ce6a0ce9a 100644 --- a/tests/python/test_indices_assert.py +++ b/tests/python/test_indices_assert.py @@ -7,7 +7,7 @@ @pytest.mark.skipif(platform.system() == 'Windows', reason="Too much virtual memory for github windows env.") -@ti.test(debug=True, gdb_trigger=False, arch=[ti.cpu]) +@ti.test(debug=True, gdb_trigger=False, packed=False, arch=[ti.cpu]) def test_indices_assert(): overflow = ti.field(ti.i32, (334, 334, 334, 2 * 10)) diff --git a/tests/python/test_packed_size.py b/tests/python/test_packed_size.py new file mode 100644 index 000000000..19837ec4f --- /dev/null +++ b/tests/python/test_packed_size.py @@ -0,0 +1,8 @@ +import taichi as ti + + +@ti.test(packed=True) +def test_packed_size(): + x = ti.field(ti.i32) + ti.root.dense(ti.i, 20).dense(ti.ijk, 334).place(x) + assert x.snode.parent().parent().cell_size_bytes == 4 * 334**3