diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 3d18507a..5ae64e1b 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -22,6 +22,17 @@ set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 #     set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}")
 # endif()
 
+# Set OpenMP. Targets link against OpenMP::OpenMP_CXX, so it is required.
+find_package(OpenMP REQUIRED)
+include(ProcessorCount)
+ProcessorCount(MAX_NUMBER_THREADS)
+# ProcessorCount yields 0 when the count cannot be determined; use 1 then.
+if(MAX_NUMBER_THREADS EQUAL 0)
+  set(MAX_NUMBER_THREADS 1)
+endif()
+message(STATUS "Setting maximum number of threads to ${MAX_NUMBER_THREADS}")
+set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DMAX_NUMBER_THREADS=${MAX_NUMBER_THREADS}")
+
 ### Download GoogleTest
 include(FetchContent)
 FetchContent_Declare(
diff --git a/src/applications/adaptive.cpp b/src/applications/adaptive.cpp
index a622a0a3..32bd3a24 100644
--- a/src/applications/adaptive.cpp
+++ b/src/applications/adaptive.cpp
@@ -114,10 +114,12 @@ int main(int argc, char* argv[]) {
   std::string problem, domain;
   size_t initial_refines = 0;
   size_t max_dofs = 0;
+  size_t num_threads = 1;
   bool calculate_condition_numbers = false;
   bool print_centers = false;
   bool print_sampling = false;
-  bool print_time_apply = false;
+  bool print_time_apply = true;
+  bool print_bilforms = false;
   std::vector<double> print_time_slices;
   boost::program_options::options_description problem_optdesc(
       "Problem options");
@@ -133,7 +135,9 @@ int main(int argc, char* argv[]) {
       "print_sampling", po::value<bool>(&print_sampling))(
       "print_time_slices",
       po::value<std::vector<double>>(&print_time_slices)->multitoken())(
-      "print_time_apply", po::value<bool>(&print_time_apply));
+      "print_time_apply", po::value<bool>(&print_time_apply))(
+      "print_bilforms", po::value<bool>(&print_bilforms))(
+      "num_threads", po::value<size_t>(&num_threads));
 
   std::sort(print_time_slices.begin(), print_time_slices.end());
 
@@ -165,10 +169,24 @@ int main(int argc, char* argv[]) {
   po::store(po::command_line_parser(argc, argv).options(cmdline_options).run(),
             vm);
   po::notify(vm);
+  // Validate user input at runtime: asserts are compiled out under NDEBUG, and
+  // Node indexes arrays of MAX_NUMBER_THREADS entries by thread number.
+  if (num_threads == 0 || num_threads > MAX_NUMBER_THREADS ||
+      num_threads > static_cast<size_t>(omp_get_max_threads())) {
+    std::cout << "Invalid number of threads: " << num_threads << std::endl;
+    return 1;
+  }
+  if (num_threads > 1 && adapt_opts.use_cache) {
+    std::cout << "Multithreading is only enabled for no-cache." << std::endl;
+    return 1;
+  }
+  omp_set_num_threads(static_cast<int>(num_threads));
+
   std::cout << "Problem options:" << std::endl;
   std::cout << "\tProblem: " << problem << std::endl;
   std::cout << "\tDomain: " << domain
             << "; initial-refines: " << initial_refines << std::endl;
+  std::cout << "\tNumber-threads: " << num_threads << std::endl;
   std::cout << std::endl;
   std::cout << adapt_opts << std::endl;
 
@@ -215,9 +229,13 @@ int main(int argc, char* argv[]) {
 
     // A slight overestimate.
     ndof_Xd = vec_Xd->Bfs().size();
+    size_t ndof_Xd_time = vec_Xd->Project_0()->Bfs().size();
+    size_t ndof_Xd_space = vec_Xd->Project_1()->Bfs().size();
     size_t ndof_Xdd = heat_eq.vec_Xdd()->Bfs().size();
     size_t ndof_Ydd = heat_eq.vec_Ydd()->Bfs().size();
     std::cout << "iter: " << ++iter << "\n\tXDelta-size: " << ndof_Xd
+              << "\n\tXDelta-space-size: " << ndof_Xd_space
+              << "\n\tXDelta-time-size: " << ndof_Xd_time
               << "\n\tXDelta-Gradedness: "
               << vec_Xd->Gradedness(&max_gradedness)
               << "\n\tXDeltaDelta-size: " << ndof_Xdd
@@ -317,18 +335,25 @@ int main(int argc, char* argv[]) {
 
     if (print_time_apply) {
       auto heat_d_dd = heat_eq.heat_d_dd();
-      std::cout << "\n\tA-time-per-apply: " << heat_d_dd->A()->TimePerApply()
-                << "\n\tB-time-per-apply: " << heat_d_dd->B()->TimePerApply()
-                << "\n\tBT-time-per-apply: " << heat_d_dd->BT()->TimePerApply()
-                << "\n\tG-time-per-apply: " << heat_d_dd->G()->TimePerApply()
-                << "\n\tP_Y-time-per-apply: "
-                << heat_d_dd->P_Y()->TimePerApply()
-                << "\n\tP_X-time-per-apply: "
-                << heat_d_dd->P_X()->TimePerApply()
-                << "\n\tS-time-per-apply: " << heat_d_dd->S()->TimePerApply()
-                << "\n\ttotal-time-apply: " << heat_d_dd->TotalTimeApply()
-                << "\n\ttotal-time-construct: "
-                << heat_d_dd->TotalTimeConstruct() << std::flush;
+      std::cout
+          << "\n\tA-time-per-apply: " << heat_d_dd->A()->TimePerApply()
+          << "\n\tB-time-per-apply: " << heat_d_dd->B()->TimePerApply()
+          << "\n\tB-A-time-per-apply: " << heat_d_dd->B()->A()->TimePerApply()
+          << "\n\tB-B-time-per-apply: " << heat_d_dd->B()->B()->TimePerApply()
+          << "\n\tBT-time-per-apply: " << heat_d_dd->BT()->TimePerApply()
+          << "\n\tG-time-per-apply: " << heat_d_dd->G()->TimePerApply()
+          << "\n\tP_Y-time-per-apply: " << heat_d_dd->P_Y()->TimePerApply()
+          << "\n\tP_X-time-per-apply: " << heat_d_dd->P_X()->TimePerApply()
+          << "\n\tS-time-per-apply: " << heat_d_dd->S()->TimePerApply()
+          << "\n\ttotal-time-apply: " << heat_d_dd->TotalTimeApply()
+          << "\n\ttotal-time-construct: " << heat_d_dd->TotalTimeConstruct()
+          << std::flush;
+    }
+    if (print_bilforms) {
+      auto heat_d_dd = heat_eq.heat_d_dd();
+      std::cout << "\n\tB-A-bilforms: " << heat_d_dd->B()->A()->Information()
+                << "\n\tP_Y-bilforms: " << heat_d_dd->P_Y()->Information()
+                << std::flush;
     }
 
     if (print_centers) {
diff --git a/src/applications/uniform.cpp b/src/applications/uniform.cpp
index 4d52bc37..5761d727 100644
--- a/src/applications/uniform.cpp
+++ b/src/applications/uniform.cpp
@@ -53,10 +53,11 @@ int main(int argc, char* argv[]) {
   size_t initial_refines = 0;
   size_t max_level = 0;
   size_t max_dofs = 0;
+  size_t num_threads = 1;
   std::string refine;
   bool calculate_condition_PY = false;
   bool calculate_condition_PX = false;
-  bool print_time_apply = false;
+  bool print_time_apply = true;
   bool print_centers = false;
   double solve_rtol = 1e-5;
   boost::program_options::options_description problem_optdesc(
@@ -70,6 +71,7 @@ int main(int argc, char* argv[]) {
           ->default_value(std::numeric_limits<std::size_t>::max()))(
       "max_dofs", po::value<size_t>(&max_dofs)->default_value(
                       std::numeric_limits<std::size_t>::max()))(
+      "num_threads", po::value<size_t>(&num_threads))(
       "refine", po::value<std::string>(&refine)->default_value("sparse"))(
       "print_centers", po::value<bool>(&print_centers))(
       "print_time_apply", po::value<bool>(&print_time_apply))(
@@ -118,6 +120,15 @@ int main(int argc, char* argv[]) {
   std::cout << adapt_opts << "\tsolve-rtol: " << solve_rtol << std::endl
             << std::endl;
 
+  // Validate user input at runtime: asserts are compiled out under NDEBUG, and
+  // Node indexes arrays of MAX_NUMBER_THREADS entries by thread number.
+  if (num_threads == 0 || num_threads > MAX_NUMBER_THREADS ||
+      num_threads > static_cast<size_t>(omp_get_max_threads())) {
+    std::cout << "Invalid number of threads: " << num_threads << std::endl;
+    return 1;
+  }
+  omp_set_num_threads(static_cast<int>(num_threads));
+
   auto T = InitialTriangulation(domain, initial_refines);
   auto B = Time::Bases();
   auto vec_Xd = std::make_shared<
@@ -229,18 +235,19 @@ int main(int argc, char* argv[]) {
 
     if (print_time_apply) {
       auto heat_d_dd = heat_eq.heat_d_dd();
-      std::cout << "\n\tA-time-per-apply: " << heat_d_dd->A()->TimePerApply()
-                << "\n\tB-time-per-apply: " << heat_d_dd->B()->TimePerApply()
-                << "\n\tBT-time-per-apply: " << heat_d_dd->BT()->TimePerApply()
-                << "\n\tG-time-per-apply: " << heat_d_dd->G()->TimePerApply()
-                << "\n\tP_Y-time-per-apply: "
-                << heat_d_dd->P_Y()->TimePerApply()
-                << "\n\tP_X-time-per-apply: "
-                << heat_d_dd->P_X()->TimePerApply()
-                << "\n\tS-time-per-apply: " << heat_d_dd->S()->TimePerApply()
-                << "\n\ttotal-time-apply: " << heat_d_dd->TotalTimeApply()
-                << "\n\ttotal-time-construct: "
-                << heat_d_dd->TotalTimeConstruct() << std::flush;
+      std::cout
+          << "\n\tA-time-per-apply: " << heat_d_dd->A()->TimePerApply()
+          << "\n\tB-time-per-apply: " << heat_d_dd->B()->TimePerApply()
+          << "\n\tB-A-time-per-apply: " << heat_d_dd->B()->A()->TimePerApply()
+          << "\n\tB-B-time-per-apply: " << heat_d_dd->B()->B()->TimePerApply()
+          << "\n\tBT-time-per-apply: " << heat_d_dd->BT()->TimePerApply()
+          << "\n\tG-time-per-apply: " << heat_d_dd->G()->TimePerApply()
+          << "\n\tP_Y-time-per-apply: " << heat_d_dd->P_Y()->TimePerApply()
+          << "\n\tP_X-time-per-apply: " << heat_d_dd->P_X()->TimePerApply()
+          << "\n\tS-time-per-apply: " << heat_d_dd->S()->TimePerApply()
+          << "\n\ttotal-time-apply: " << heat_d_dd->TotalTimeApply()
+          << "\n\ttotal-time-construct: " << heat_d_dd->TotalTimeConstruct()
+          << std::flush;
     }
 
     if (print_centers) {
diff --git a/src/datastructures/boost.hpp b/src/datastructures/boost.hpp
index a14afec6..0ddc15ed 100644
--- a/src/datastructures/boost.hpp
+++ b/src/datastructures/boost.hpp
@@ -3,6 +3,7 @@
 #include <boost/container/options.hpp>
 #include <boost/container/small_vector.hpp>
 #include <boost/container/static_vector.hpp>
+#include <vector>
 template <typename I, size_t N>
 using SmallVector = boost::container::small_vector<I, N>;
 
diff --git a/src/datastructures/multi_tree_view.ipp b/src/datastructures/multi_tree_view.ipp
index 452e39c1..20fdf709 100644
--- a/src/datastructures/multi_tree_view.ipp
+++ b/src/datastructures/multi_tree_view.ipp
@@ -87,13 +87,14 @@ std::vector<I*> MultiNodeViewInterface<I, T...>::Union(
     // Now do the union magic in all dimensions.
     static_for<dim>([&queue, &my_node, &other_node, &call_filter](auto i) {
       // Get a list of all children of the other_node in axis `i`.
-      static std::vector<I_other*> filtered_children;
+      static thread_local std::vector<I_other*> filtered_children;
       filtered_children.clear();
       for (const auto& other_child_i : other_node->children(i))
         if (call_filter(other_child_i))
           filtered_children.emplace_back(other_child_i);
 
-      static std::vector<std::tuple_element_t<i, TupleNodes>> other_children_i;
+      static thread_local std::vector<std::tuple_element_t<i, TupleNodes>>
+          other_children_i;
       other_children_i.clear();
       for (const auto& other_child_i : filtered_children)
         other_children_i.emplace_back(std::get<i>(other_child_i->nodes()));
diff --git a/src/datastructures/tree.hpp b/src/datastructures/tree.hpp
index 45bbbd1c..c70b48be 100644
--- a/src/datastructures/tree.hpp
+++ b/src/datastructures/tree.hpp
@@ -1,5 +1,8 @@
 #pragma once
+#include <omp.h>
+
 #include <algorithm>
+#include <array>
 #include <memory>
 #include <queue>
 #include <utility>
@@ -17,6 +20,12 @@ using T_func_noop = decltype(func_noop);
 using T_func_true = decltype(func_true);
 using T_func_false = decltype(func_false);
 
+// Thread-local cache of the OpenMP thread number, set once per thread.
+#ifndef MAX_NUMBER_THREADS
+#define MAX_NUMBER_THREADS 1
+#endif
+static thread_local int thread_number = omp_get_thread_num();
+
 template <typename I>
 struct NodeTrait;  // This should define N_children and N_parents.
 
@@ -38,8 +47,8 @@ class Node {
   }
 
   int level() const { return level_; }
-  bool marked() const { return marked_; }
-  void set_marked(bool value) { marked_ = value; }
+  bool marked() const { return marked_[thread_number]; }
+  void set_marked(bool value) { marked_[thread_number] = value; }
   bool is_leaf() const { return children_.size() == 0; }
   inline bool is_metaroot() const { return (level_ == -1); }
   const auto &parents() const { return parents_; }
@@ -48,20 +57,19 @@ class Node {
   // General data field for universal storage.
   template <typename T>
   T *data() {
-    assert(data_ != nullptr);
-    return static_cast<T *>(data_);
+    assert(data_[thread_number] != nullptr);
+    return static_cast<T *>(data_[thread_number]);
   }
-
   template <typename T>
   void set_data(T *value) {
-    assert(data_ == nullptr);
-    data_ = static_cast<void *>(value);
+    assert(data_[thread_number] == nullptr);
+    data_[thread_number] = static_cast<void *>(value);
   }
   void reset_data() {
-    assert(data_ != nullptr);
-    data_ = nullptr;
+    assert(data_[thread_number] != nullptr);
+    data_[thread_number] = nullptr;
   }
-  bool has_data() { return data_ != nullptr; }
+  bool has_data() { return data_[thread_number] != nullptr; }
 
   template <typename Func = T_func_noop>
   std::vector<I *> Bfs(bool include_metaroot = false,
@@ -93,9 +101,9 @@ class Node {
   }
 
  protected:
-  bool marked_ = false;
   int level_;
-  void *data_ = nullptr;
+  std::array<unsigned short, MAX_NUMBER_THREADS> marked_{0};
+  std::array<void *, MAX_NUMBER_THREADS> data_{nullptr};
 
   // Store children/parents as raw pointers.
   SmallVector<I *, NodeTrait<I>::N_children> children_;
diff --git a/src/space/CMakeLists.txt b/src/space/CMakeLists.txt
index 473fcbfe..dc0df2dd 100644
--- a/src/space/CMakeLists.txt
+++ b/src/space/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_library(space STATIC triangulation.cpp initial_triangulation.cpp basis.cpp operators.cpp triangulation_view.cpp linear_form.cpp integration.cpp)
+target_link_libraries(space PUBLIC OpenMP::OpenMP_CXX)
 
 # Executables
 add_executable(space_adaptive adaptive.cpp)
diff --git a/src/space/operators.cpp b/src/space/operators.cpp
index bcaf33df..2bcb838a 100644
--- a/src/space/operators.cpp
+++ b/src/space/operators.cpp
@@ -213,13 +213,13 @@ void CGInverse<ForwardOp>::ApplySingleScale(Eigen::VectorXd &vec_SS) const {
 
 // Define the class variables.
 template <typename ForwardOp>
-std::vector<std::vector<std::pair<uint, double>>>
+thread_local std::vector<std::vector<std::pair<uint, double>>>
     MultigridPreconditioner<ForwardOp>::row_mat;
 template <typename ForwardOp>
-std::vector<std::vector<Element2D *>>
+thread_local std::vector<std::vector<Element2D *>>
     MultigridPreconditioner<ForwardOp>::patches;
 template <typename ForwardOp>
-std::vector<std::vector<uint>>
+thread_local std::vector<std::vector<uint>>
     MultigridPreconditioner<ForwardOp>::vertices_relaxation;
 
 template <typename ForwardOp>
@@ -346,12 +346,12 @@ void MultigridPreconditioner<ForwardOp>::ApplySingleScale(
   // Shortcut.
   const uint V = triang_.V;
 
-  // Reuse a static variable for storing the corrections.
-  static std::vector<double> e;
+  // Reuse a static variable for storing the row of a matrix.
+  static thread_local std::vector<double> e;
   e.reserve(V * 3);
 
   // Reuse a static variable for storing the residual in the downard cycle.
-  static std::vector<double> r_down;
+  static thread_local std::vector<double> r_down;
   r_down.reserve(V * 3);
 
   // Initialize the multigrid matrix (row_mat).
diff --git a/src/space/operators.hpp b/src/space/operators.hpp
index 49c1e0ee..d6aeae9a 100644
--- a/src/space/operators.hpp
+++ b/src/space/operators.hpp
@@ -221,9 +221,9 @@ class MultigridPreconditioner : public BackwardOperator {
   DirectInverse<ForwardOp> initial_triang_solver_;
 
   // (Static) variables reused for calculation of the multigrid matrix.
-  static std::vector<std::vector<std::pair<uint, double>>> row_mat;
-  static std::vector<std::vector<Element2D *>> patches;
-  static std::vector<std::vector<uint>> vertices_relaxation;
+  static thread_local std::vector<std::vector<std::pair<uint, double>>> row_mat;
+  static thread_local std::vector<std::vector<Element2D *>> patches;
+  static thread_local std::vector<std::vector<uint>> vertices_relaxation;
 };
 
 template <template <typename> class InverseOp>
diff --git a/src/spacetime/bilinear_form.hpp b/src/spacetime/bilinear_form.hpp
index ced4d695..c4238725 100644
--- a/src/spacetime/bilinear_form.hpp
+++ b/src/spacetime/bilinear_form.hpp
@@ -57,6 +57,8 @@ class BilinearForm
   auto sigma() { return sigma_; }
   auto theta() { return theta_; }
 
+  std::string Information() final;
+
  protected:
   // References to in/out vectors.
   DblVecIn *vec_in_;
@@ -72,6 +74,7 @@ class BilinearForm
   // Debug information.
   using BilinearFormBase<DblVecIn, DblVecOut>::time_construct_;
   using BilinearFormBase<DblVecIn, DblVecOut>::time_apply_;
+  using BilinearFormBase<DblVecIn, DblVecOut>::time_apply_split_;
   using BilinearFormBase<DblVecIn, DblVecOut>::num_apply_;
 
   // Define frozen templates, useful for storing the bil forms.
@@ -87,6 +90,14 @@ class BilinearForm
   std::vector<Time::BilinearForm<OperatorTime, FI<0>, FO<0>>> bil_time_low_;
   std::vector<Time::BilinearForm<OperatorTime, FI<0>, FO<0>>> bil_time_upp_;
   std::vector<space::BilinearForm<OperatorSpace, FO<1>, FO<1>>> bil_space_upp_;
+
+  // Store ordering for spatial parallelism.
+  std::vector<FI<0> *> sigma_proj_0_;
+  std::vector<FO<1> *> theta_proj_1_;
+  std::vector<FO<0> *> vec_out_proj_0_;
+  std::vector<FO<1> *> vec_out_proj_1_;
+  std::vector<size_t> ordering_sigma_;
+  std::vector<size_t> ordering_vec_out_;
 };
 
 // Helper functions.
@@ -146,6 +157,7 @@ class BlockDiagonalBilinearForm
   Eigen::VectorXd Apply(const Eigen::VectorXd &v) final;
   DblVecIn *vec_in() const final { return vec_in_; }
   DblVecOut *vec_out() const final { return vec_out_; }
+  std::string Information() final;
 
  protected:
   bool use_cache_;
@@ -157,6 +169,7 @@ class BlockDiagonalBilinearForm
   // Debug information.
   using BilinearFormBase<DblVecIn, DblVecOut>::time_construct_;
   using BilinearFormBase<DblVecIn, DblVecOut>::time_apply_;
+  using BilinearFormBase<DblVecIn, DblVecOut>::time_apply_split_;
   using BilinearFormBase<DblVecIn, DblVecOut>::num_apply_;
 
   // The (cached) bilinear forms.
@@ -164,6 +177,9 @@ class BlockDiagonalBilinearForm
   using FI = datastructures::FrozenDoubleNode<
       datastructures::DoubleNodeVector<BasisTimeIn, BasisSpace>, i>;
   std::vector<space::BilinearForm<OperatorSpace, FI<1>, FI<1>>> space_bilforms_;
+
+  std::vector<FI<0> *> vec_out_proj_0_;
+  std::vector<size_t> ordering_;
 };
 
 template <typename OpSpace, typename BTimeIn, typename BTimeOut>
diff --git a/src/spacetime/bilinear_form.ipp b/src/spacetime/bilinear_form.ipp
index 6ac4a37a..aaaf8578 100644
--- a/src/spacetime/bilinear_form.ipp
+++ b/src/spacetime/bilinear_form.ipp
@@ -29,7 +29,11 @@ BilinearForm<OperatorTime, OperatorSpace, BasisTimeIn, BasisTimeOut>::
       sigma_(sigma),
       theta_(theta),
       use_cache_(use_cache),
-      space_opts_(std::move(space_opts)) {
+      space_opts_(std::move(space_opts)),
+      sigma_proj_0_(sigma_->Project_0()->Bfs()),  // Same order as declared.
+      theta_proj_1_(theta_->Project_1()->Bfs()),
+      vec_out_proj_0_(vec_out_->Project_0()->Bfs()),
+      vec_out_proj_1_(vec_out_->Project_1()->Bfs()) {
   auto time_start = std::chrono::steady_clock::now();
 #ifdef VERBOSE
   std::cerr << std::left;
@@ -83,11 +87,74 @@ BilinearForm<OperatorTime, OperatorSpace, BasisTimeIn, BasisTimeOut>::
       auto fiber_out = psi_out_labda->FrozenOtherAxis();
       bil_space_upp_.emplace_back(fiber_in, fiber_out, space_opts_);
     }
+  } else {
+    std::vector<size_t> sizes(vec_out_proj_0_.size());
+    ordering_vec_out_.resize(vec_out_proj_0_.size());
+#pragma omp parallel for schedule(dynamic, 1)
+    for (int i = 0; i < vec_out_proj_0_.size(); ++i) {
+      sizes[i] =
+          std::max(vec_out_proj_0_[i]->FrozenOtherAxis()->Bfs().size(),
+                   theta_->Fiber_1(vec_out_proj_0_[i]->node())->Bfs().size());
+      ordering_vec_out_[i] = i;
+    }
+    std::sort(ordering_vec_out_.begin(), ordering_vec_out_.end(),
+              [&sizes](int i, int j) { return sizes[i] > sizes[j]; });
+
+    sizes.resize(sigma_proj_0_.size());
+    ordering_sigma_.resize(sigma_proj_0_.size());
+#pragma omp parallel for schedule(dynamic, 1)
+    for (int i = 0; i < sigma_proj_0_.size(); ++i) {
+      sizes[i] =
+          std::max(sigma_proj_0_[i]->FrozenOtherAxis()->Bfs().size(),
+                   vec_in_->Fiber_1(sigma_proj_0_[i]->node())->Bfs().size());
+      ordering_sigma_[i] = i;
+    }
+    std::sort(ordering_sigma_.begin(), ordering_sigma_.end(),
+              [&sizes](int i, int j) { return sizes[i] > sizes[j]; });
   }
+
   time_construct_ = std::chrono::duration<double>(
       std::chrono::steady_clock::now() - time_start);
 }
 
+// Returns a debug string "([..],[..],[..],[..])": four lists holding, per
+// fiber pair, an "(in,out)," entry with the Bfs sizes of both fibers.
+template <template <typename, typename> class OperatorTime,
+          typename OperatorSpace, typename BasisTimeIn, typename BasisTimeOut>
+std::string BilinearForm<OperatorTime, OperatorSpace, BasisTimeIn,
+                         BasisTimeOut>::Information() {
+  std::stringstream result;
+  // Appends the sizes of one (input fiber, output fiber) pair.
+  auto append_sizes = [&result](auto &&fiber_in, auto &&fiber_out) {
+    result << "(" << fiber_in->Bfs().size() << "," << fiber_out->Bfs().size()
+           << "),";
+  };
+  // Lists follow the order used in Apply: R_Sigma(Id x A_1)I_Lambda,
+  // R_Lambda(L_0 x Id)I_Sigma, R_Theta(U_1 x Id)I_Lambda, R_Lambda(Id x A2)I_Theta.
+  result << "([";
+  for (auto psi_in_labda : sigma_proj_0_) {
+    auto fiber_in = vec_in_->Fiber_1(psi_in_labda->node());
+    append_sizes(fiber_in, psi_in_labda->FrozenOtherAxis());
+  }
+  result << "],[";
+  for (auto psi_out_labda : vec_out_proj_1_) {
+    auto fiber_in = sigma_->Fiber_0(psi_out_labda->node());
+    append_sizes(fiber_in, psi_out_labda->FrozenOtherAxis());
+  }
+  result << "],[";
+  for (auto psi_in_labda : theta_proj_1_) {
+    auto fiber_in = vec_in_->Fiber_0(psi_in_labda->node());
+    append_sizes(fiber_in, psi_in_labda->FrozenOtherAxis());
+  }
+  result << "],[";
+  for (auto psi_out_labda : vec_out_proj_0_) {
+    auto fiber_in = theta_->Fiber_1(psi_out_labda->node());
+    append_sizes(fiber_in, psi_out_labda->FrozenOtherAxis());
+  }
+  result << "])";
+  return result.str();
+}
+
 template <template <typename, typename> class OperatorTime,
           typename OperatorSpace, typename BasisTimeIn, typename BasisTimeOut>
 Eigen::VectorXd BilinearForm<OperatorTime, OperatorSpace, BasisTimeIn,
@@ -104,71 +171,109 @@ Eigen::VectorXd BilinearForm<OperatorTime, OperatorSpace, BasisTimeIn,
   // Store the input in the double tree.
   vec_in_->FromVectorContainer(v_in);
 
+  // clang-format off
   // Check whether we have to recalculate the bilinear forms.
   if (!use_cache_) {
-    // Calculate R_sigma(Id x A_1)I_Lambda.
-    for (auto psi_in_labda : sigma_->Project_0()->Bfs()) {
-      auto fiber_in = vec_in_->Fiber_1(psi_in_labda->node());
-      auto fiber_out = psi_in_labda->FrozenOtherAxis();
-      if (fiber_out->children().empty()) continue;
-      auto bil_form = space::CreateBilinearForm<OperatorSpace>(
-          fiber_in, fiber_out, space_opts_);
-      bil_form.Apply();
-    }
-
-    // Calculate R_Lambda(L_0 x Id)I_Sigma.
-    for (auto psi_out_labda : vec_out_->Project_1()->Bfs()) {
-      auto fiber_in = sigma_->Fiber_0(psi_out_labda->node());
-      if (fiber_in->children().empty()) continue;
-      auto fiber_out = psi_out_labda->FrozenOtherAxis();
-      auto bil_form =
-          Time::CreateBilinearForm<OperatorTime>(fiber_in, fiber_out);
-      bil_form.ApplyLow();
-    }
-
-    // Store the lower output.
-    v_lower = vec_out_->ToVectorContainer();
+    // Execute the rest in parallel.
+    #pragma omp parallel
+    {
+      // Calculate R_sigma(Id x A_1)I_Lambda.
+      auto time_compute = std::chrono::steady_clock::now();
+      #pragma omp for schedule(dynamic, 1)
+      for (int j = 0; j < sigma_proj_0_.size(); ++j) {
+        int i = ordering_sigma_[j];
+        auto psi_in_labda = sigma_proj_0_[i];
+        auto fiber_in = vec_in_->Fiber_1(psi_in_labda->node());
+        auto fiber_out = psi_in_labda->FrozenOtherAxis();
+        if (fiber_out->children().empty()) continue;
+        auto bil_form = space::CreateBilinearForm<OperatorSpace>(
+            fiber_in, fiber_out, space_opts_);
+        bil_form.Apply();
+      }
+      if (omp_get_thread_num() == 0)
+        time_apply_split_[0] += std::chrono::duration<double>(
+                std::chrono::steady_clock::now() - time_compute);
+
+      // Calculate R_Lambda(L_0 x Id)I_Sigma.
+      time_compute = std::chrono::steady_clock::now();
+      #pragma omp for schedule(guided)
+      for (int j = 0; j < vec_out_proj_1_.size(); ++j) {
+        int i = vec_out_proj_1_.size() - j - 1;
+        auto psi_out_labda = vec_out_proj_1_[i];
+        auto fiber_in = sigma_->Fiber_0(psi_out_labda->node());
+        if (fiber_in->children().empty()) continue;
+        auto fiber_out = psi_out_labda->FrozenOtherAxis();
+        auto bil_form =
+            Time::CreateBilinearForm<OperatorTime>(fiber_in, fiber_out);
+        bil_form.ApplyLow();
+      }
+      if (omp_get_thread_num() == 0)
+        time_apply_split_[1] += std::chrono::duration<double>(
+                std::chrono::steady_clock::now() - time_compute);
+
+      #pragma omp single
+      {
+        // Store the lower output.
+        v_lower = vec_out_->ToVectorContainer();
+
+        // Reset the input, if necessary.
+        if (vec_in_ == sigma_.get() ||
+            static_cast<void *>(vec_in_) == static_cast<void *>(vec_out_))
+          vec_in_->FromVectorContainer(v_in);
+      }
+
+      // Calculate R_Theta(U_1 x Id)I_Lambda.
+      time_compute = std::chrono::steady_clock::now();
+      #pragma omp for schedule(guided)
+      for (int j = 0; j < theta_proj_1_.size(); ++j) {
+        int i = theta_proj_1_.size() - 1 - j;
+        auto psi_in_labda = theta_proj_1_[i];
+        auto fiber_in = vec_in_->Fiber_0(psi_in_labda->node());
+        auto fiber_out = psi_in_labda->FrozenOtherAxis();
+        if (fiber_out->children().empty()) continue;
+        auto bil_form =
+            Time::CreateBilinearForm<OperatorTime>(fiber_in, fiber_out);
+        bil_form.ApplyUpp();
+      }
+      if (omp_get_thread_num() == 0)
+        time_apply_split_[2] += std::chrono::duration<double>(
+                std::chrono::steady_clock::now() - time_compute);
+
+      // Calculate R_Lambda(Id x A2)I_Theta.
+      time_compute = std::chrono::steady_clock::now();
+      #pragma omp for schedule(dynamic, 1)
+      for (int j = 0; j < vec_out_proj_0_.size(); ++j) {
+        int i = ordering_vec_out_[j];
+        auto psi_out_labda = vec_out_proj_0_[i];
+        auto fiber_in = theta_->Fiber_1(psi_out_labda->node());
+        if (fiber_in->children().empty()) continue;
+        auto fiber_out = psi_out_labda->FrozenOtherAxis();
+        auto bil_form = space::CreateBilinearForm<OperatorSpace>(
+            fiber_in, fiber_out, space_opts_);
+        bil_form.Apply();
+      }
+      if (omp_get_thread_num() == 0)
+        time_apply_split_[3] += std::chrono::duration<double>(
+                std::chrono::steady_clock::now() - time_compute);
 
-    // Reset the input, if necessary.
-    if (vec_in_ == sigma_.get() ||
-        static_cast<void *>(vec_in_) == static_cast<void *>(vec_out_))
-      vec_in_->FromVectorContainer(v_in);
-
-    // Calculate R_Theta(U_1 x Id)I_Lambda.
-    for (auto psi_in_labda : theta_->Project_1()->Bfs()) {
-      auto fiber_in = vec_in_->Fiber_0(psi_in_labda->node());
-      auto fiber_out = psi_in_labda->FrozenOtherAxis();
-      if (fiber_out->children().empty()) continue;
-      auto bil_form =
-          Time::CreateBilinearForm<OperatorTime>(fiber_in, fiber_out);
-      bil_form.ApplyUpp();
-    }
-
-    // Calculate R_Lambda(Id x A2)I_Theta.
-    for (auto psi_out_labda : vec_out_->Project_0()->Bfs()) {
-      auto fiber_in = theta_->Fiber_1(psi_out_labda->node());
-      if (fiber_in->children().empty()) continue;
-      auto fiber_out = psi_out_labda->FrozenOtherAxis();
-      auto bil_form = space::CreateBilinearForm<OperatorSpace>(
-          fiber_in, fiber_out, space_opts_);
-      bil_form.Apply();
     }
   } else {
     // Apply the lower part using cached bil forms.
-    for (auto &bil_form : bil_space_low_) bil_form.Apply();
-    for (auto &bil_form : bil_time_low_) bil_form.ApplyLow();
+    for (int i = 0; i < bil_space_low_.size(); ++i) bil_space_low_[i].Apply();
+    for (int i = 0; i < bil_time_low_.size(); ++i) bil_time_low_[i].ApplyLow();
 
     // Store the lower output.
     v_lower = vec_out_->ToVectorContainer();
-
+  
     // Reset the input, if necessary.
     if (vec_in_ == sigma_.get() ||
         static_cast<void *>(vec_in_) == static_cast<void *>(vec_out_))
       vec_in_->FromVectorContainer(v_in);
 
     // Apply the upper part using cached bil forms.
-    for (auto &bil_form : bil_time_upp_) bil_form.ApplyUpp();
-    for (auto &bil_form : bil_space_upp_) bil_form.Apply();
+    for (int i = 0; i < bil_time_upp_.size(); ++i) bil_time_upp_[i].ApplyUpp();
+    for (int i = 0; i < bil_space_upp_.size(); ++i) bil_space_upp_[i].Apply();
+
   }
 
   // Return vectorized output.
@@ -230,12 +335,13 @@ BlockDiagonalBilinearForm<OperatorSpace, BasisTimeIn, BasisTimeOut>::
     : vec_in_(vec_in),
       vec_out_(vec_out),
       use_cache_(use_cache),
-      space_opts_(std::move(space_opts)) {
+      space_opts_(std::move(space_opts)),
+      vec_out_proj_0_(vec_out->Project_0()->Bfs()) {
   auto time_start = std::chrono::steady_clock::now();
   assert(vec_in->container().size() == vec_out->container().size());
   // If use cache, cache the bil forms here.
   if (use_cache_) {
-    for (auto psi_out_labda : vec_out_->Project_0()->Bfs()) {
+    for (auto psi_out_labda : vec_out_proj_0_) {
       auto fiber_in = vec_in_->Fiber_1(psi_out_labda->node());
       if (fiber_in->children().empty()) continue;
       auto fiber_out = psi_out_labda->FrozenOtherAxis();
@@ -243,11 +349,39 @@ BlockDiagonalBilinearForm<OperatorSpace, BasisTimeIn, BasisTimeOut>::
       space_opts_.time_level = std::get<0>(psi_out_labda->nodes())->level();
       space_bilforms_.emplace_back(fiber_in, fiber_out, space_opts_);
     }
+  } else {
+    std::vector<size_t> sizes(vec_out_proj_0_.size());
+    ordering_.resize(vec_out_proj_0_.size());
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (int i = 0; i < vec_out_proj_0_.size(); ++i)  {
+        sizes[i] = vec_out_proj_0_[i]->FrozenOtherAxis()->Bfs().size();
+        ordering_[i] = i;
+    }
+    std::sort(ordering_.begin(), ordering_.end(), [&sizes](int i, int j) {
+            return sizes[i] > sizes[j];
+            });
   }
   time_construct_ = std::chrono::duration<double>(
       std::chrono::steady_clock::now() - time_start);
 }
 
+// Debug string of fiber sizes; falls back to Bfs order if ordering_ is unset.
+template <typename OperatorSpace, typename BasisTimeIn, typename BasisTimeOut>
+std::string BlockDiagonalBilinearForm<OperatorSpace, BasisTimeIn,
+                                      BasisTimeOut>::Information() {
+  std::stringstream result;
+  result << "[";
+  for (size_t j = 0; j < vec_out_proj_0_.size(); ++j) {
+    auto psi_out_labda = vec_out_proj_0_[ordering_.empty() ? j : ordering_[j]];
+    auto fiber_in = vec_in_->Fiber_1(psi_out_labda->node());
+    auto fiber_out = psi_out_labda->FrozenOtherAxis();
+    result << "(" << fiber_in->Bfs().size() << "," << fiber_out->Bfs().size()
+           << "),";
+  }
+  result << "]";
+  return result.str();
+}
+
 template <typename OperatorSpace, typename BasisTimeIn, typename BasisTimeOut>
 Eigen::VectorXd
 BlockDiagonalBilinearForm<OperatorSpace, BasisTimeIn, BasisTimeOut>::Apply(
@@ -263,14 +397,18 @@ BlockDiagonalBilinearForm<OperatorSpace, BasisTimeIn, BasisTimeOut>::Apply(
   vec_in_->FromVectorContainer(v_in);
 
   if (!use_cache_) {
-    for (auto psi_out_labda : vec_out_->Project_0()->Bfs()) {
+    #pragma omp parallel for schedule(dynamic, 1)
+    for (int j = 0; j < vec_out_proj_0_.size(); ++j) {
+      int i = ordering_[j];
+      auto psi_out_labda = vec_out_proj_0_[i];
       auto fiber_in = vec_in_->Fiber_1(psi_out_labda->node());
       if (fiber_in->children().empty()) continue;
       auto fiber_out = psi_out_labda->FrozenOtherAxis();
       // Set the level of the time wavelet.
-      space_opts_.time_level = std::get<0>(psi_out_labda->nodes())->level();
+      space::OperatorOptions space_opts{space_opts_};
+      space_opts.time_level = std::get<0>(psi_out_labda->nodes())->level();
       auto bil_form = space::CreateBilinearForm<OperatorSpace>(
-          fiber_in, fiber_out, space_opts_);
+          fiber_in, fiber_out, space_opts);
       bil_form.Apply();
     }
   } else {
diff --git a/src/spacetime/bilinear_form_linalg.hpp b/src/spacetime/bilinear_form_linalg.hpp
index 0d5197c1..ef821707 100644
--- a/src/spacetime/bilinear_form_linalg.hpp
+++ b/src/spacetime/bilinear_form_linalg.hpp
@@ -56,16 +56,27 @@ class BilinearFormBase
   }
 
   double TimeApply() const { return time_apply_.count(); };
-  double TimePerApply() const {
-    if (num_apply_ == 0) return 0;
-    return time_apply_.count() / num_apply_;
+  const auto &TimeApplySplit() const { return time_apply_split_; }
+  // Per-apply timings, formatted "(total,split_0,...)" with one entry per split.
+  std::string TimePerApply() const {
+    // With no recorded applies the divisor is -1, which avoids dividing by 0.
+    const int divisor = num_apply_ == 0 ? -1 : static_cast<int>(num_apply_);
+    std::stringstream out;
+    out << "(" << time_apply_.count() / divisor;
+    for (const auto &split : time_apply_split_)
+      out << "," << split.count() / divisor;
+    out << ")";
+    return out.str();
+  };
   double TimeConstruct() const { return time_construct_.count(); }
 
+  // Overridable debug summary; the default asserts, then returns "" (no UB in NDEBUG).
+  virtual std::string Information() { assert(false); return {}; }
  protected:
   // Timing debug information.
   std::chrono::duration<double> time_construct_{0};
   std::chrono::duration<double> time_apply_{0};
+  std::array<std::chrono::duration<double>, 4> time_apply_split_{};
   size_t num_apply_ = 0;
 };
 
@@ -131,6 +142,8 @@ class SumBilinearForm : public BilinearFormBase<typename BilFormA::DblVecIn,
     // Store timing results.
     time_apply_ += std::chrono::duration<double>(
         std::chrono::steady_clock::now() - time_start);
+    for (size_t i = 0; i < time_apply_split_.size(); i++)
+      time_apply_split_[i] = a_->TimeApplySplit()[i] + b_->TimeApplySplit()[i];
 
     return result;
   }
@@ -139,6 +152,9 @@ class SumBilinearForm : public BilinearFormBase<typename BilFormA::DblVecIn,
   auto sigma() { return a_->sigma(); }
   auto theta() { return a_->theta(); }
 
+  std::shared_ptr<BilFormA> A() { return a_; }  // Shared handle to operand a_.
+  std::shared_ptr<BilFormB> B() { return b_; }  // Shared handle to operand b_.
+
   auto Transpose() {
     auto a_t = a_->Transpose();
     auto b_t = b_->Transpose();
@@ -152,6 +168,7 @@ class SumBilinearForm : public BilinearFormBase<typename BilFormA::DblVecIn,
   std::shared_ptr<BilFormB> b_;
 
   using BilinearFormBase<DblVecIn, DblVecOut>::time_apply_;
+  using BilinearFormBase<DblVecIn, DblVecOut>::time_apply_split_;
   using BilinearFormBase<DblVecIn, DblVecOut>::num_apply_;
 };
 
@@ -216,6 +233,10 @@ class SchurBilinearForm
     // Store timing results.
     time_apply_ += std::chrono::duration<double>(
         std::chrono::steady_clock::now() - time_start);
+    for (size_t i = 0; i < time_apply_split_.size(); i++)
+      time_apply_split_[i] = b_->TimeApplySplit()[i] +
+                             a_inv_->TimeApplySplit()[i] +
+                             bt_->TimeApplySplit()[i] + g_->TimeApplySplit()[i];
 
     return result;
   }
@@ -230,6 +251,7 @@ class SchurBilinearForm
   std::shared_ptr<G> g_;
 
   using BilinearFormBase<DblVecIn, DblVecOut>::time_apply_;
+  using BilinearFormBase<DblVecIn, DblVecOut>::time_apply_split_;
   using BilinearFormBase<DblVecIn, DblVecOut>::num_apply_;
 };
 }  // namespace spacetime
diff --git a/src/spacetime/bilinear_form_performance.cpp b/src/spacetime/bilinear_form_performance.cpp
index 0f3c7dde..5ee9aa31 100644
--- a/src/spacetime/bilinear_form_performance.cpp
+++ b/src/spacetime/bilinear_form_performance.cpp
@@ -19,12 +19,19 @@ using namespace space;
 using namespace Time;
 using namespace datastructures;
 
-constexpr int level = 10;
+constexpr int num_threads = 4;
+constexpr int level = 15;
 constexpr int bilform_iters = 5;
 constexpr int inner_iters = 10;
 constexpr bool use_cache = true;
 
 int main() {
+  omp_set_num_threads(num_threads);
+#pragma omp parallel
+  {
+    // Code inside this region runs in parallel.
+    std::cout << "Hello from thread " << omp_get_thread_num() << std::endl;
+  }
   auto B = Time::Bases();
   auto T = InitialTriangulation::UnitSquare();
   T.hierarch_basis_tree.UniformRefine(::level);
@@ -42,6 +49,8 @@ int main() {
         DoubleTreeVector<ThreePointWaveletFn, HierarchicalBasisFn>>();
     auto vec_Y = Y_delta.template DeepCopy<
         DoubleTreeVector<OrthonormalWaveletFn, HierarchicalBasisFn>>();
+    vec_X.ComputeFibers();
+    vec_Y.ComputeFibers();
     auto bil_form =
         CreateBilinearForm<Time::TransportOperator, space::MassOperator>(
             &vec_X, &vec_Y, /* use_cache */ use_cache);
diff --git a/src/time/CMakeLists.txt b/src/time/CMakeLists.txt
index 7461c915..b1ef9464 100644
--- a/src/time/CMakeLists.txt
+++ b/src/time/CMakeLists.txt
@@ -1,4 +1,5 @@
 add_library(time STATIC basis.cpp haar_basis.cpp orthonormal_basis.cpp three_point_basis.cpp hierarchical_basis.cpp integration.cpp)
+target_link_libraries(time PUBLIC OpenMP::OpenMP_CXX)
 
 add_executable(bilinear_form_performance bilinear_form_performance.cpp)
 target_link_libraries(bilinear_form_performance time BoostProgramOptions)
diff --git a/src/time/orthonormal_basis.cpp b/src/time/orthonormal_basis.cpp
index a16a4e68..a1e74041 100644
--- a/src/time/orthonormal_basis.cpp
+++ b/src/time/orthonormal_basis.cpp
@@ -45,29 +45,33 @@ double DiscLinearScalingFn::Eval(double t, bool deriv) const {
 
 bool DiscLinearScalingFn::Refine() {
   if (is_full()) return false;
-  assert(children_.empty());
-  support_[0]->Refine();
-  auto [l, n] = labda();
-  auto P = std::vector{this, nbr_};
-  auto child_elts = support_[0]->children();
-  make_child(
-      /* parents */ P, /* index */ 2 * n + 0,
-      /* support */ std::vector{child_elts[0]});
-  make_child(
-      /* parents */ P, /* index */ 2 * n + 1,
-      /* support */ std::vector{child_elts[0]});
-  make_child(
-      /* parents */ P, /* index */ 2 * n + 2,
-      /* support */ std::vector{child_elts[1]});
-  make_child(
-      /* parents */ P, /* index */ 2 * n + 3,
-      /* support */ std::vector{child_elts[1]});
 
-  nbr_->children_ = children_;
-  children_[0]->nbr_ = children_[1];
-  children_[1]->nbr_ = children_[0];
-  children_[2]->nbr_ = children_[3];
-  children_[3]->nbr_ = children_[2];
+  // The is_full() test above is an unlocked fast path, so it is repeated inside
+  // the unnamed OpenMP critical section: another thread may have refined this
+  // node in between, in which case this call still returns true.
+  // NOTE(review): the fast-path read is unsynchronized; confirm this is safe.
+#pragma omp critical
+  if (!is_full()) {
+    assert(children_.empty());
+    support_[0]->Refine();
+    auto [lev, k] = labda();
+    auto parent_pair = std::vector{this, nbr_};
+    auto elts = support_[0]->children();
+    make_child(/* parents */ parent_pair, /* index */ 2 * k + 0,
+               /* support */ std::vector{elts[0]});
+    make_child(/* parents */ parent_pair, /* index */ 2 * k + 1,
+               /* support */ std::vector{elts[0]});
+    make_child(/* parents */ parent_pair, /* index */ 2 * k + 2,
+               /* support */ std::vector{elts[1]});
+    make_child(/* parents */ parent_pair, /* index */ 2 * k + 3,
+               /* support */ std::vector{elts[1]});
+    // The neighbour gets the same children; children are paired as neighbours.
+    nbr_->children_ = children_;
+    children_[0]->nbr_ = children_[1];
+    children_[1]->nbr_ = children_[0];
+    children_[2]->nbr_ = children_[3];
+    children_[3]->nbr_ = children_[2];
+  }
   return true;
 }
 
diff --git a/src/time/three_point_basis.cpp b/src/time/three_point_basis.cpp
index 18689d86..f1a5f055 100644
--- a/src/time/three_point_basis.cpp
+++ b/src/time/three_point_basis.cpp
@@ -34,58 +34,62 @@ double ContLinearScalingFn::EvalMother(double t, bool deriv) const {
 
 ContLinearScalingFn *ContLinearScalingFn::RefineMiddle() {
   if (child_middle_) return child_middle_;
-  auto [l, n] = labda();
-  for (auto elem : support_) elem->Refine();
+  // The test above is an unlocked fast path; creation runs in the unnamed
+  // OpenMP critical section and re-tests in case another thread was first.
+#pragma omp critical
+  if (!child_middle_) {
+    auto [lev, k] = labda();
+    for (auto support_elem : support_) support_elem->Refine();
 
-  std::vector<Element1D *> child_support;
-  if (n > 0) child_support.push_back(support_[0]->children()[1]);
-  if (n < (1LL << l)) child_support.push_back(support_.back()->children()[0]);
+    std::vector<Element1D *> child_supp;
+    if (k > 0) child_supp.push_back(support_[0]->children()[1]);
+    if (k < (1LL << lev)) child_supp.push_back(support_.back()->children()[0]);
 
-  // Create child, and add accordingly.
-  child_middle_ = make_child(
-      /* parents */ std::vector{this},
-      /* index */ 2 * n,
-      /* support */ child_support);
+    // Create the child with this node as its only parent and store it.
+    child_middle_ = make_child(/* parents */ std::vector{this},
+                               /* index */ 2 * k, /* support */ child_supp);
 
-  if (child_left_) {
-    child_left_->nbr_right_ = child_middle_;
-    child_middle_->nbr_left_ = child_left_;
-  }
-  if (child_right_) {
-    child_right_->nbr_left_ = child_middle_;
-    child_middle_->nbr_right_ = child_right_;
+    if (child_left_) {
+      child_left_->nbr_right_ = child_middle_;
+      child_middle_->nbr_left_ = child_left_;
+    }
+    if (child_right_) {
+      child_right_->nbr_left_ = child_middle_;
+      child_middle_->nbr_right_ = child_right_;
+    }
   }
-
   return child_middle_;
 }
 
 ContLinearScalingFn *ContLinearScalingFn::RefineLeft() {
   assert(nbr_left_);
   if (child_left_) return child_left_;
-  support_[0]->Refine();
-  auto [l, n] = labda();
-
-  // Create child, and add accordingly.
-  auto elems = support_[0]->children();
-  child_left_ = make_child(
-      /* parents */ std::vector{nbr_left_, this},
-      /* index */ (2 * n - 1),
-      /* support */ std::vector{elems[0], elems[1]});
-
-  // Add this child to our left neighbour.
-  nbr_left_->child_right_ = child_left_;
-  nbr_left_->children_.push_back(child_left_);
-
-  // Update neighbours of children.
-  if (nbr_left_->child_middle_) {
-    nbr_left_->child_middle_->nbr_right_ = child_left_;
-    child_left_->nbr_left_ = nbr_left_->child_middle_;
-  }
-  if (child_middle_) {
-    child_middle_->nbr_left_ = child_left_;
-    child_left_->nbr_right_ = child_middle_;
-  }
+  // The test above is an unlocked fast path; creation runs in the unnamed
+  // OpenMP critical section and re-tests in case another thread was first.
+#pragma omp critical
+  if (!child_left_) {
+    support_[0]->Refine();
+    auto [lev, k] = labda();
 
+    // Create the child (parents: left neighbour and this node) and store it.
+    auto elts = support_[0]->children();
+    child_left_ = make_child(/* parents */ std::vector{nbr_left_, this},
+                             /* index */ (2 * k - 1),
+                             /* support */ std::vector{elts[0], elts[1]});
+    // Add this child to our left neighbour.
+    nbr_left_->child_right_ = child_left_;
+    nbr_left_->children_.push_back(child_left_);
+
+    // Update neighbours of children.
+    if (nbr_left_->child_middle_) {
+      nbr_left_->child_middle_->nbr_right_ = child_left_;
+      child_left_->nbr_left_ = nbr_left_->child_middle_;
+    }
+    if (child_middle_) {
+      child_middle_->nbr_left_ = child_left_;
+      child_left_->nbr_right_ = child_middle_;
+    }
+  }
   return child_left_;
 }